import pandas as pd
import numpy as np
import math
import copy
import random
import time
from scipy.interpolate import BSpline
import matplotlib.pyplot as plt
import matplotlib
import multiprocessing as mp
import cProfile
import re
import timeit
import warnings
import shutil
import os
from glob import glob
import gc
import xlsxwriter
import pywt
import pycwt as wavelet
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import make_regression
from keras import models, Sequential, optimizers, backend
from keras.layers import core, Conv2D, pooling, Flatten, Dense, Dropout, BatchNormalization
from keras.callbacks import ModelCheckpoint, CSVLogger, TensorBoard
from keras.utils import Sequence
import skimage
from skimage.io import imread
from skimage.transform import resize
# Increase TensorFlow's GPU memory usage in order to be able to run (0.9 seems to result in frequent crashes)
# TF1-style session configuration: cap GPU memory so the run fits
# (0.9 was chosen because higher fractions caused frequent crashes).
config = tf.ConfigProto()
config.gpu_options.allocator_type = 'BFC'  # best-fit-with-coalescing allocator
config.gpu_options.per_process_gpu_memory_fraction = 0.9  # use at most 90% of GPU memory
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())  # show which CPUs/GPUs TF can see
os.uname()  # host info (the bare expression is only displayed in a notebook)
def get_fileInfo():
    """Collect the CARMENES ``.dat`` file paths and their shortened names.

    The data directory depends on the host (macOS laptop vs. Linux server).
    Returns a tuple ``(file_names, file_paths)``.
    """
    # Choose directory depending on the OS I'm running
    # (similar code is found through the entire project).
    if "Darwin" in os.uname():
        directory = "/Users/jakob/Desktop/Master Project/git/carmenes/"
    else:
        directory = "/home/salomonsson/Master Project/git/carmenes/"
    file_paths = glob(directory + "*.dat")
    # Collect the .dat file names, trimming the shared trailing information.
    file_names = []
    for _root, _dirs, files in os.walk(directory):
        for fname in files:
            if fname.endswith(".dat"):
                file_names.append(fname[:-10])  # drop common suffix
    return file_names, file_paths
# Resolve the CARMENES file names/paths once at module level.
file_names, file_paths = get_fileInfo()
def load_all_data():
    """Read every CARMENES .dat file and split it into per-order DataFrames.

    Uses the module-level ``file_paths``/``file_names``.  Rows containing "#"
    act as order separators; each order becomes its own two-column
    (Angstrom/Flux) DataFrame.  Returns a list with one entry per file, each
    entry being the list of that file's per-order DataFrames.
    """
    # Get the data from each .dat file
    all_loaded_data = []  # Create a list to store all the data.
    for i in range(len(file_paths)):
        data = pd.read_table(file_paths[i], sep=" ", header=None)
        data.index.names = [file_names[i]]  # Add the file name to the dataframe
        data.dropna(inplace=True)  # Remove all NaN-values and their respective rows
        limits = (data[data.values == "#"])  # Find all the indeces with a "#"
        data_pieces, lim_count = [], 0
        # NOTE(review): the inner loop reuses the name ``i`` and shadows the
        # outer file index; harmless here because nothing after the loop reads
        # the outer ``i``, but worth renaming if this is ever refactored.
        for i in range(len(limits)):
            start = limits.index[lim_count]
            if lim_count == (len(limits) - 1):
                d_piece = data.loc[start:, :1]  # The last data piece contains the last order in the df data
            else:
                d_piece = data.loc[start:limits.index[lim_count + 1], :1]  # Select the correct data piece
            d_piece.index.names = [str(d_piece.index.names)[2:-2] + "_Order_" + str(lim_count)]  # add order to name
            d_piece = d_piece[d_piece.iloc[:, 0] != "#"]  # Remove all rows with a "#" in data_piece
            d_piece.columns = ["Angstrom", "Flux"]
            data_pieces.append(d_piece)
            lim_count += 1
        all_loaded_data.append(data_pieces)
    return all_loaded_data
# Load everything once at module level (slow: reads every .dat file).
all_loaded_data = load_all_data()
# Notebook display settings: show long tables/arrays in full.
pd.options.display.max_rows = 50000
np.set_printoptions(threshold=100000)
# Plot these three files:
a = 1
b = 2
c = 3
# The "#" order markers and 'o=' tags are read as NaN.
test_data1 = pd.read_table(file_paths[a], sep=" ", header=None, skiprows=1, engine='python', na_values={"#", 'o='})
test_data2 = pd.read_table(file_paths[b], sep=" ", header=None, skiprows=1, engine='python', na_values={"#", 'o='})
test_data3 = pd.read_table(file_paths[c], sep=" ", header=None, skiprows=1, engine='python', na_values={"#", 'o='})
print(file_paths[a])
print(file_paths[b])
print(file_paths[c][-45:-4])  # slice trims the directory prefix and ".dat"
display(test_data1.head(10))  # ``display`` is the IPython/Jupyter helper
# Plot it
plt.figure(figsize=(15,8))
plt.plot(test_data1.iloc[:,0], test_data1.iloc[:,1], color='green')
plt.plot(test_data2.iloc[:,0], test_data2.iloc[:,1], color='red')
plt.plot(test_data3.iloc[:,0], test_data3.iloc[:,1], color='black')
plt.title("Carmenes NIR, order 1", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=15, weight=0)
plt.xlabel("Angstrom", fontsize=15, weight=0)
plt.legend([file_paths[a][-45:-4], file_paths[b][-45:-4], file_paths[c][-45:-4]], loc="upper right", fontsize=12)
plt.xticks(fontsize=12, weight=0)
plt.yticks(fontsize=12, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.ylim((-0.01,1))
plt.show()
# Plot these three files:
aa = 4
bb = 6
cc = 7
# The "#" order markers and 'o=' tags are read as NaN.
test_data11 = pd.read_table(file_paths[aa], sep=" ", header=None, skiprows=1, engine='python', na_values={"#", 'o='})
test_data22 = pd.read_table(file_paths[bb], sep=" ", header=None, skiprows=1, engine='python', na_values={"#", 'o='})
test_data33 = pd.read_table(file_paths[cc], sep=" ", header=None, skiprows=1, engine='python', na_values={"#", 'o='})
print(file_paths[aa])
print(file_paths[bb])
print(file_paths[cc][-45:-4])  # slice trims the directory prefix and ".dat"
# Plot it
plt.figure(figsize=(15,8))
plt.plot(test_data11.iloc[:,0], test_data11.iloc[:,1], color='green')
plt.plot(test_data22.iloc[:,0], test_data22.iloc[:,1], color='red')
plt.plot(test_data33.iloc[:,0], test_data33.iloc[:,1], color='black')
plt.title("Carmenes VIS, order 1", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=15, weight=0)
plt.xlabel("Angstrom", fontsize=15, weight=0)
# Fix: the legend previously reused a/b/c from the NIR plot above, so the
# labels did not match the files plotted here (aa/bb/cc).
plt.legend([file_paths[aa][-45:-4], file_paths[bb][-45:-4], file_paths[cc][-45:-4]], loc="lower left", fontsize=12)
plt.xticks(fontsize=12, weight=0)
plt.yticks(fontsize=12, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.ylim((-1e7,1.3e7))
plt.show()
print(len(file_paths))
print(all_loaded_data[1][0].iloc[2,:])
def seperate_data(data):
    """Split the loaded spectra into visual (VIS) and near-infrared (NIR) lists.

    Each dataset is classified by whether "vis" or "nir" occurs in the index
    name of its first order; anything else is silently dropped.  Deep copies
    are stored so the originals stay untouched.
    """
    vis_sets, nir_sets = [], []
    for dataset in data:
        label = str(dataset[0].index.names)
        if "vis" in label:
            vis_sets.append(copy.deepcopy(dataset))
        elif "nir" in label:
            nir_sets.append(copy.deepcopy(dataset))
    return vis_sets, nir_sets
# Split the loaded data into VIS and NIR sets and inspect their sizes.
all_loaded_data_VIS, all_loaded_data_NIR = seperate_data(all_loaded_data)
print(len(all_loaded_data_VIS))        # number of VIS stars
print(len(all_loaded_data_VIS[0][0]))  # rows in the first order
print(len(all_loaded_data_VIS[0]))     # orders in the first star
That is, all order 0 spectra start and end at the same indices, all order 1 spectra start and end at the same indices, and so on. The chosen start index is the largest among the start indices, and the end index is the smallest among the end indices.
"""Visual wavelength (VIS) data"""
for i in range(len(all_loaded_data_VIS[0])):
all_min_VIS_values, all_max_VIS_values = [], []
for j in range(len(all_loaded_data_VIS)):
all_min_VIS_values.append(all_loaded_data_VIS[j][i].index[0])
all_max_VIS_values.append(all_loaded_data_VIS[j][i].index[-1])
VIS_min, VIS_max = max(all_min_VIS_values), min(all_max_VIS_values) # choose the min and max limits for each order
# Align the values for all the VIS data (according to VIS_min and VIS_max) so they match each other.
for h in range(len(all_loaded_data_VIS)):
all_loaded_data_VIS[h][i] = all_loaded_data_VIS[h][i].loc[VIS_min:VIS_max]
"""Near Infrared (NIR) data"""
for k in range(len(all_loaded_data_NIR[0])):
all_min_NIR_values, all_max_NIR_values = [], []
for m in range(len(all_loaded_data_NIR)):
all_min_NIR_values.append(all_loaded_data_NIR[m][k].index[0])
all_max_NIR_values.append(all_loaded_data_NIR[m][k].index[-1])
NIR_min, NIR_max = max(all_min_NIR_values), min(all_max_NIR_values) # choose the min and max limits for each order
# Align the values for all the NIR data (according to NIR_min and NIR_max) so they match each other.
for n in range(len(all_loaded_data_NIR)):
all_loaded_data_NIR[n][k] = all_loaded_data_NIR[n][k].loc[NIR_min:NIR_max]
print(all_loaded_data_VIS[0][0].iloc[:10,])
print(all_loaded_data_NIR[0][0].iloc[:10,])
print(all_loaded_data_NIR[1][0].iloc[:10,])
(The run below will take roughly 20 minutes in total.)
# Create new lists for the averaged values (deep copies, so the aligned
# originals are not modified in place).
all_loaded_data_VIS_average = copy.deepcopy(all_loaded_data_VIS)
all_loaded_data_NIR_average = copy.deepcopy(all_loaded_data_NIR)
import re
from itertools import groupby
# Define some functions
def create_file_path(name):
    """Return the path under which the normalised file *name* is stored."""
    on_mac = "Darwin" in os.uname()
    base = ("/Users/jakob/Desktop/Master Project/git/carmenes_normalised/" if on_mac
            else "/home/salomonsson/Master Project/git/carmenes_normalised/")
    return base + name + ".dat"
def sorted_nicely(l):
    """Sort *l* in place in natural (alphanumeric) order and return it.

    Digit runs inside the strings are compared as integers, so "a2" sorts
    before "a10".  Fix: the original returned the result of ``list.sort``,
    which is always ``None``; the sorted list itself is now returned while
    the in-place behaviour is kept for existing callers.
    """
    convert = lambda text: int(text) if text.isdigit() else text
    alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
    l.sort(key=alphanum_key)
    return l
# https://stackoverflow.com/questions/2669059/how-to-sort-alpha-numeric-set-in-python
def sorted_nicely_short(s):
    """Natural-sort key for *s*: alternating text chunks and integer chunks."""
    chunks = []
    # The leading NUL guarantees the first chunk is non-digit, so all keys
    # start with a string and stay mutually comparable.
    for is_digit, group in groupby('\0' + s, str.isdigit):
        text = ''.join(group)
        chunks.append(int(text) if is_digit else text)
    return chunks
# https://stackoverflow.com/questions/2669059/how-to-sort-alpha-numeric-set-in-python
# Choose directory depending on the OS I'm running
if "Darwin" in os.uname():
    d = "/Users/jakob/Desktop/Master Project/git/carmenes_normalised/"
else:
    d = "/home/salomonsson/Master Project/git/carmenes_normalised/"
# Expected number of normalised VIS files: stars x orders.
quantity1 = len(all_loaded_data_VIS_average) * len(all_loaded_data_VIS_average[0])
for dirpath, dirnames, files in os.walk(d):  # Get the amount of VIS files in the directory d
    vis_files = []
    for i in files:
        if "vis" in i:
            vis_files.append(i)
    if len(vis_files) != quantity1:  # if all normalised VIS files arent in d, normalise all_loaded_data_VIS
        start1 = time.time()
        """Visual wavelength (VIS) data"""
        # NOTE(review): this averages column 0 (Angstrom) of each order/row
        # across all stars and writes it back — i.e. it puts every star on a
        # shared wavelength grid.  Confirm column 0 (not Flux) is intended.
        for a in range(len(all_loaded_data_VIS_average[0])):
            for i in range(len(all_loaded_data_VIS_average[0][0])):
                all_VIS_values = []
                for j in range(len(all_loaded_data_VIS_average)):
                    all_VIS_values.append(float(all_loaded_data_VIS_average[j][a].iloc[i, 0]))
                VIS_average = np.mean(all_VIS_values)  # Get the average
                # Replace the old values with the VIS_average.
                for h in range(len(all_loaded_data_VIS_average)):
                    all_loaded_data_VIS_average[h][a].iloc[i, 0] = VIS_average
        # Check how long time it took.
        end1 = time.time()
        tt1 = end1 - start1
        print("Total run time: {0:0.0f} min and {1:0.0f} s ".format((tt1-tt1%60)/60, tt1%60))
        # Make sure all values are float values
        for i in range(len(all_loaded_data_VIS_average)):
            for j in range(len(all_loaded_data_VIS_average[i])):
                all_loaded_data_VIS_average[i][j].iloc[:,0] = list(map(float, all_loaded_data_VIS_average[i][j].iloc[:,0]))
                all_loaded_data_VIS_average[i][j].iloc[:,1] = list(map(float, all_loaded_data_VIS_average[i][j].iloc[:,1]))
        # Save the normalised VIS data in separate files
        for i in range(len(all_loaded_data_VIS_average)):
            for j in range(len(all_loaded_data_VIS_average[0])):
                name = all_loaded_data_VIS_average[i][j].index.name
                all_loaded_data_VIS_average[i][j].to_csv(create_file_path(name), header=False, index=True)
    else:
        print("Normalised VIS files already exists, download them instead to save time.")
        # Get the file names and save them to a list
        vis_files = []
        for file_name in files:
            if "vis" in file_name:
                file_name = file_name[:-4]  # Remove the .dat notation in the name
                vis_files.append(file_name)
# Expected number of normalised NIR files: stars x orders.
quantity2 = len(all_loaded_data_NIR_average) * len(all_loaded_data_NIR_average[0])
for dirpath, dirnames, files in os.walk(d):
    nir_files = []
    for i in files:
        if "nir" in i:
            nir_files.append(i)
    if len(nir_files) != quantity2:  # if not all normalised NIR files are in d, normalise all_loaded_data_NIR
        start2 = time.time()
        """Near Infrared (NIR) data"""
        # Same column-0 (Angstrom) averaging as the VIS branch above.
        for b in range(len(all_loaded_data_NIR_average[0])):
            for k in range(len(all_loaded_data_NIR_average[0][0])):
                all_NIR_values = []
                for m in range(len(all_loaded_data_NIR_average)):
                    all_NIR_values.append(float(all_loaded_data_NIR_average[m][b].iloc[k, 0]))
                NIR_average = np.mean(all_NIR_values)  # Get the average
                # Replace the old values with the NIR_average.
                for n in range(len(all_loaded_data_NIR_average)):
                    all_loaded_data_NIR_average[n][b].iloc[k, 0] = NIR_average
        # Check how long time it took.
        end2 = time.time()
        tt2 = end2 - start2
        print("Total run time: {0:0.0f} min and {1:0.0f} s ".format((tt2-tt2%60)/60, tt2%60))
        # Make sure all values are float values
        for i in range(len(all_loaded_data_NIR_average)):
            for j in range(len(all_loaded_data_NIR_average[i])):
                all_loaded_data_NIR_average[i][j].iloc[:,0] = list(map(float, all_loaded_data_NIR_average[i][j].iloc[:,0]))
                all_loaded_data_NIR_average[i][j].iloc[:,1] = list(map(float, all_loaded_data_NIR_average[i][j].iloc[:,1]))
        # Save the normalised NIR data in separate files
        for x in range(len(all_loaded_data_NIR_average)):
            for y in range(len(all_loaded_data_NIR_average[0])):
                name = all_loaded_data_NIR_average[x][y].index.name
                all_loaded_data_NIR_average[x][y].to_csv(create_file_path(name), header=False, index=True)
    else:
        print("All the NIR files are already normalised, download them instead.")
        # Get the file names and save them to a list
        nir_files = []
        for file_name in files:
            if "nir" in file_name:
                file_name = file_name[:-4]  # Remove the .dat notation in the name
                nir_files.append(file_name)
# Sort the lists in natural order (a2 before a10).
vis_files = sorted(vis_files, key=sorted_nicely_short)
nir_files = sorted(nir_files, key=sorted_nicely_short)
files = sorted(files, key=sorted_nicely_short)
#sorted_nicely(vis_files)
#sorted_nicely(nir_files)
#sorted_nicely(files)
# Get the directory for each normalised .dat file and store them in file_paths
if "Darwin" in os.uname():
    d = "/Users/jakob/Desktop/Master Project/git/carmenes_normalised/"
else:
    d = "/home/salomonsson/Master Project/git/carmenes_normalised/"
file_paths = glob(d + "*.dat")  # NOTE: rebinds the module-level file_paths
file_paths = sorted(file_paths, key=sorted_nicely_short)
# Save NIR and VIS in separate lists
file_paths_NIR, file_paths_VIS = [], []
for path in file_paths:
    if "nir" in path:
        file_paths_NIR.append(path)
    elif "vis" in path:
        file_paths_VIS.append(path)
def combine_Carmenes_Orders(folder, NIR_or_VIS):
    """Concatenate all per-order .dat files of one normalised plot folder.

    Parameters: ``folder`` is e.g. "plot1"; ``NIR_or_VIS`` selects the
    "NIR" or "VIS" sub-directory (case-insensitive).  Returns one DataFrame
    with columns ["Index", "Angstrom", "Flux"].

    NOTE(review): an unrecognised ``NIR_or_VIS`` only prints a message and
    leaves ``tp`` undefined (NameError below), and an empty folder makes
    ``pd.concat``/``path`` fail — confirm inputs are always valid.
    """
    # A function to combine and plot the carmenes data with all orders included
    if NIR_or_VIS.upper() == "NIR":
        tp = "/NIR/"
    elif NIR_or_VIS.upper() == "VIS":
        tp = "/VIS/"
    else:
        print("Something is not correct")
    if "Darwin" in os.uname():
        d = "/Users/jakob/Desktop/Master Project/git/Carmenes_norm_plots/" + folder + tp
    else:
        d = "/home/salomonsson/Master Project/git/Carmenes_norm_plots/" + folder + tp
    file_paths_Carm = glob(d + "*.dat")
    file_paths_Carm = sorted(file_paths_Carm, key=sorted_nicely_short)
    list_of_df = []
    for path in file_paths_Carm:
        data = pd.read_table(path, sep=',', header=None)
        list_of_df.append(data)
    final_data = pd.concat(list_of_df)
    # The name is sliced out of the LAST ``path`` of the loop; the magic
    # numbers [70:-13] assume the server directory layout above.
    final_data.index.names = [str(path)[70:-13]]  # add name
    final_data.columns = ["Index", "Angstrom", "Flux"]
    return final_data
NIR_data1 = combine_Carmenes_Orders("plot1", "NIR")
NIR_data2 = combine_Carmenes_Orders("plot2", "NIR")
NIR_data3 = combine_Carmenes_Orders("plot3", "NIR")
VIS_data1 = combine_Carmenes_Orders("plot1", "VIS")
VIS_data2 = combine_Carmenes_Orders("plot2", "VIS")
VIS_data3 = combine_Carmenes_Orders("plot3", "VIS")
# Plot it
plt.figure(figsize=(15,8))
plt.plot(NIR_data3.iloc[:,1], NIR_data3.iloc[:,2], color='green')
plt.plot(NIR_data1.iloc[:,1], NIR_data1.iloc[:,2], color='red')
plt.plot(NIR_data2.iloc[:,1], NIR_data2.iloc[:,2], color='black')
plt.title("Normalised Carmenes NIR", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=18, weight=0)
plt.xlabel("Angstrom", fontsize=18, weight=0)
plt.legend([str(NIR_data3.index.names)[2:-2],
str(NIR_data1.index.names)[2:-2],
str(NIR_data2.index.names)[2:-2]], loc="upper right", fontsize=15)
plt.xticks(fontsize=15, weight=0)
plt.yticks(fontsize=15, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.ylim((-0.01,1))
plt.show()
# Plot it
plt.figure(figsize=(15,8))
plt.plot(VIS_data3.iloc[:,1], VIS_data3.iloc[:,2], color='green')
plt.plot(VIS_data1.iloc[:,1], VIS_data1.iloc[:,2], color='red')
plt.plot(VIS_data2.iloc[:,1], VIS_data2.iloc[:,2], color='black')
plt.title("Normalised Carmenes VIS", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=18, weight=0)
plt.xlabel("Angstrom", fontsize=18, weight=0)
plt.legend([str(VIS_data3.index.names)[2:-2],
str(VIS_data1.index.names)[2:-2],
str(VIS_data2.index.names)[2:-2]], loc="lower left", fontsize=15)
plt.xticks(fontsize=15, weight=0)
plt.yticks(fontsize=15, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.ylim((-1e7,1.3e7))
plt.show()
print(len(files))
#print(files)
print(len(file_paths))
print(len(nir_files))
print(len(vis_files))
# Load the normalised data back from the .dat files, replacing the in-memory
# frames (``nbr`` walks the flat, naturally-sorted path lists in step with
# the nested star/order loops).
# VIS
nbr = 0
for i in range(len(all_loaded_data_VIS_average)):
    for j in range(len(all_loaded_data_VIS_average[i])):
        all_loaded_data_VIS_average[i][j] = pd.read_csv(file_paths_VIS[nbr], names=["Angstrom", "Flux"])
        all_loaded_data_VIS_average[i][j].index.names = [vis_files[nbr]]
        nbr += 1
# NIR
nbr = 0
for i in range(len(all_loaded_data_NIR_average)):
    for j in range(len(all_loaded_data_NIR_average[i])):
        all_loaded_data_NIR_average[i][j] = pd.read_csv(file_paths_NIR[nbr], names=["Angstrom", "Flux"])
        all_loaded_data_NIR_average[i][j].index.names = [nir_files[nbr]]
        nbr += 1
# VIS + NIR (this will be easier to use in future pre-processing steps)
all_loaded_data_average = []
nbr = 0
for i in range(len(all_loaded_data_VIS_average)):
    for j in range(len(all_loaded_data_VIS_average[i])):
        data_comb = pd.read_csv(file_paths[nbr], names=["Angstrom", "Flux"])
        data_comb.index.names = [files[nbr]]
        all_loaded_data_average.append(data_comb)
        nbr += 1
for i in range(len(all_loaded_data_NIR_average)):
    for j in range(len(all_loaded_data_NIR_average[i])):
        data_comb = pd.read_csv(file_paths[nbr], names=["Angstrom", "Flux"])
        data_comb.index.names = [files[nbr]]
        all_loaded_data_average.append(data_comb)
        nbr += 1
#print(all_loaded_data_NIR_average[0][0].iloc[0,:])
# Peek at the first three rows of the first NIR spectrum.
L = []
L.append(all_loaded_data_NIR_average[0][0].iloc[0,:])
L.append(all_loaded_data_NIR_average[0][0].iloc[1,:])
L.append(all_loaded_data_NIR_average[0][0].iloc[2,:])
df = pd.DataFrame(L)
#print(L)
display(df)  # IPython/Jupyter helper
#temp = all_loaded_data_NIR_average[0][0].iloc[0,:]
#temp["Flux"] = 23
#print(temp)
Convert the Flux values from strings to floats.
# For Visual data: force the Flux column to float.
for i in range(len(all_loaded_data_VIS_average)):
    for j in range(len(all_loaded_data_VIS_average[i])):
        all_loaded_data_VIS_average[i][j].iloc[:,1] = list(map(float, all_loaded_data_VIS_average[i][j].iloc[:,1]))
# For Near Infrared data
for i in range(len(all_loaded_data_NIR_average)):
    for j in range(len(all_loaded_data_NIR_average[i])):
        all_loaded_data_NIR_average[i][j].iloc[:,1] = list(map(float, all_loaded_data_NIR_average[i][j].iloc[:,1]))
# For the combined (VIS + NIR) data: both columns.
for j in range(len(all_loaded_data_average)):
    all_loaded_data_average[j].iloc[:,0] = list(map(float, all_loaded_data_average[j].iloc[:,0]))
    all_loaded_data_average[j].iloc[:,1] = list(map(float, all_loaded_data_average[j].iloc[:,1]))
# Spot-checks.
print(len(all_loaded_data_average))
#print(all_loaded_data_average[100])
print(type(all_loaded_data_VIS_average[0][0].iloc[0,0]))
print(all_loaded_data_VIS_average[0][0].iloc[:10,])
print(all_loaded_data_VIS_average[1][0].iloc[:10,])
print(all_loaded_data_NIR_average[0][0].iloc[:10,])
print(all_loaded_data_NIR_average[1][0].iloc[:10,])
Alpha is neglected due to its small influence. It is also partly included in the metallicity and, as such, will make very little difference when neglected.
# Plot order 0 of the first three averaged VIS spectra.
x1 = all_loaded_data_VIS_average[0][0].iloc[:, 0]
y1 = all_loaded_data_VIS_average[0][0].iloc[:, 1]
x2 = all_loaded_data_VIS_average[1][0].iloc[:, 0]
y2 = all_loaded_data_VIS_average[1][0].iloc[:, 1]
x3 = all_loaded_data_VIS_average[2][0].iloc[:, 0]
y3 = all_loaded_data_VIS_average[2][0].iloc[:, 1]
plt.figure(figsize=(15,8))
plt.plot(x1.values, y1.values)
plt.plot(x2.values, y2.values)
plt.plot(x3.values, y3.values)
plt.title("Carmenes VIS", fontsize=20)
plt.ylabel("Flux", fontsize=18)
plt.xlabel("Angstrom", fontsize=18)
# Fix: the third legend entry previously repeated x2; it must label x3.
plt.legend([x1.index.names[0], x2.index.names[0], x3.index.names[0]], loc="upper left", fontsize=15)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.autoscale(enable=True, axis="x", tight=True)
plt.show()
# Plot order 0 of the first three averaged NIR spectra.
xx1 = all_loaded_data_NIR_average[0][0].iloc[:, 0]
yy1 = all_loaded_data_NIR_average[0][0].iloc[:, 1]
xx2 = all_loaded_data_NIR_average[1][0].iloc[:, 0]
yy2 = all_loaded_data_NIR_average[1][0].iloc[:, 1]
xx3 = all_loaded_data_NIR_average[2][0].iloc[:, 0]
yy3 = all_loaded_data_NIR_average[2][0].iloc[:, 1]
plt.figure(figsize=(15,8))
plt.plot(xx1.values, yy1.values)
plt.plot(xx2.values, yy2.values)
plt.plot(xx3.values, yy3.values)
plt.title("Carmenes NIR", fontsize=20)
plt.ylabel("Flux", fontsize=18)
plt.xlabel("Angstrom", fontsize=18)
# Fix: the third legend entry previously repeated xx2; it must label xx3.
plt.legend([xx1.index.names[0], xx2.index.names[0], xx3.index.names[0]], loc="upper right", fontsize=15)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.autoscale(enable=True, axis="x", tight=True)
plt.show()
#(The big step (hole) below is just one step)
# Re-combine and re-plot the normalised data (same plots as earlier).
NIR_data1 = combine_Carmenes_Orders("plot1", "NIR")
NIR_data2 = combine_Carmenes_Orders("plot2", "NIR")
NIR_data3 = combine_Carmenes_Orders("plot3", "NIR")
VIS_data1 = combine_Carmenes_Orders("plot1", "VIS")
VIS_data2 = combine_Carmenes_Orders("plot2", "VIS")
VIS_data3 = combine_Carmenes_Orders("plot3", "VIS")
# Plot it (NIR)
plt.figure(figsize=(15,8))
plt.plot(NIR_data3.iloc[:,1], NIR_data3.iloc[:,2], color='green')
plt.plot(NIR_data1.iloc[:,1], NIR_data1.iloc[:,2], color='red')
plt.plot(NIR_data2.iloc[:,1], NIR_data2.iloc[:,2], color='black')
plt.title("Normalised Carmenes NIR", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=18, weight=0)
plt.xlabel("Angstrom", fontsize=18, weight=0)
plt.legend([str(NIR_data3.index.names)[2:-2],
            str(NIR_data1.index.names)[2:-2],
            str(NIR_data2.index.names)[2:-2]], loc="upper right", fontsize=15)
plt.xticks(fontsize=15, weight=0)
plt.yticks(fontsize=15, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.ylim((-0.01,1))
plt.show()
# Plot it (VIS)
plt.figure(figsize=(15,8))
plt.plot(VIS_data3.iloc[:,1], VIS_data3.iloc[:,2], color='green')
plt.plot(VIS_data1.iloc[:,1], VIS_data1.iloc[:,2], color='red')
plt.plot(VIS_data2.iloc[:,1], VIS_data2.iloc[:,2], color='black')
plt.title("Normalised Carmenes VIS", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=18, weight=0)
plt.xlabel("Angstrom", fontsize=18, weight=0)
plt.legend([str(VIS_data3.index.names)[2:-2],
            str(VIS_data1.index.names)[2:-2],
            str(VIS_data2.index.names)[2:-2]], loc="lower left", fontsize=15)
plt.xticks(fontsize=15, weight=0)
plt.yticks(fontsize=15, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.ylim((-1e7,1.3e7))
plt.show()
def get_btsettl_info(NIR_or_VIS):
    """Return (file names, file paths) of the BT-Settl .txt files.

    ``NIR_or_VIS`` selects the "NIR" or "VIS" sub-folder (case-insensitive);
    names are shortened at the first ".BT" and both lists come back in
    natural sort order.  An unrecognised argument only prints a message
    (the sub-folder is then undefined on the Mac branch).
    """
    selection = NIR_or_VIS.upper()
    if selection == "NIR":
        f = "NIR/"
    elif selection == "VIS":
        f = "VIS/"
    else:
        print("Some kind of in(out)put error has occured")
    # Pick the data directory for this machine.
    if "Darwin" in os.uname():
        dir_settl = "/Users/jakob/Desktop/Master Project/bt-settl (new version)/" + f
    else:
        dir_settl = "/opt/bt-settl/"
    paths_settl = glob(dir_settl + "*.txt")
    file_names_settl = []
    for _root, _dirs, files_bt in os.walk(dir_settl):
        for filename in files_bt:
            if filename.endswith(".txt"):
                # Keep only the part before ".BT" (common suffix).
                file_names_settl.append(filename.split(".BT")[0])
    return (sorted(file_names_settl, key=sorted_nicely_short),
            sorted(paths_settl, key=sorted_nicely_short))
# Resolve the BT-Settl NIR file info once at module level.
file_names_settl, paths_settl = get_btsettl_info('NIR')
print(len(paths_settl))
def get_btsettl_data(position):
    """Return the BT-Settl spectrum at index ``position`` of ``paths_settl``.

    Reads the whitespace-separated table (skipping the 8 header rows), tags
    the index with the matching entry of ``file_names_settl``, drops NaN rows
    and names the columns ["Angstrom", "Flux"].  Fix: the separator pattern
    is passed as a raw string so the backslash is not treated as an (invalid)
    string escape.
    """
    data = pd.read_table(paths_settl[position], sep=r'\s+', header=None, skiprows=8)
    data.index.names = [file_names_settl[position]]  # Add the file name to the dataframe
    data.dropna(inplace=True)  # Remove NaN-values
    data.columns = ["Angstrom", "Flux"]
    return data
def create_btsettl_path(name):
    """Return the output path for the convolved BT-Settl file *name*."""
    on_laptop = "Jakobs-MBP.home" in os.uname()
    base = ("/Users/jakob/Desktop/Master Project/bt-settl_normalised/" if on_laptop
            else "/home/salomonsson/Master Project/bt-settl/")
    return base + name + ".dat"
def create_btsettl_path_CLEAN(name):
    """Return the path for the cleaned BT-Settl file *name*."""
    on_laptop = "Jakobs-MBP.home" in os.uname()
    base = ("/Users/jakob/Desktop/Master Project/bt-settl-CLEAN/" if on_laptop
            else "/home/salomonsson/Master Project/bt-settl-CLEAN/")
    return base + name + ".dat"
def get_btsettl_data_byName(name):
    """Return the BT-Settl spectrum whose path contains ``name``.

    Searches the module-level ``paths_settl``; the first match is read
    (whitespace separated, 8 header rows skipped), NaN rows dropped and the
    columns named ["Angstrom", "Flux"].  Returns ``None`` (implicitly) when
    no path matches.  Fix: the separator pattern is passed as a raw string
    so the backslash is not treated as an (invalid) string escape.
    """
    for path in paths_settl:
        if name in path:
            data = pd.read_table(path, sep=r'\s+', header=None, skiprows=8)
            data.index.names = [name]  # Add the file name to the dataframe
            data.dropna(inplace=True)  # Remove NaN-values (if any)
            data.columns = ["Angstrom", "Flux"]
            return data
## https://stackoverflow.com/questions/2566412/find-nearest-value-in-numpy-array # + added functionality
def find_nearest(array, value):
    """Locate the entry of a monotonically increasing ``array`` nearest to ``value``.

    Sentinels: returns -1 when ``value`` lies below the first element and
    ``len(array)`` when it lies above the last.  An exact hit on either end
    point returns the INDEX (0 or n-1); otherwise the VALUE of the closer of
    ``array[j-1]``/``array[j+1]`` around the bisection slot ``j`` is returned
    (``array[j]`` itself on a tie).  This mixed index/value contract is
    deliberately kept identical to the original implementation, which the
    convolution code depends on.
    """
    n = len(array)
    # Out-of-range sentinels.
    if value < array[0]:
        return -1
    if value > array[n - 1]:
        return n
    # Bisection: shrink [lo, hi] until the two limits are adjacent.
    lo, hi = 0, n - 1
    while hi - lo > 1:
        mid = (lo + hi) >> 1  # midpoint via bitshift
        if value >= array[mid]:
            lo = mid
        else:
            hi = mid
    # Exact matches on the end points return indices, not values.
    if value == array[0]:
        return 0
    if value == array[n - 1]:
        return n - 1
    dist_prev = np.abs(value - array[lo - 1])
    dist_next = np.abs(value - array[lo + 1])
    if dist_prev > dist_next:
        return array[lo + 1]
    if dist_prev < dist_next:
        return array[lo - 1]
    return array[lo]
# Load three BT-Settl spectra and plot them.  Each xx*/yy* pair loads the
# same frame twice; only column 0 (Angstrom) resp. 1 (Flux) is used below.
xx1 = get_btsettl_data_byName(file_names_settl[0])
yy1 = get_btsettl_data_byName(file_names_settl[0])
xx2 = get_btsettl_data_byName(file_names_settl[1])
yy2 = get_btsettl_data_byName(file_names_settl[1])
xx3 = get_btsettl_data_byName(file_names_settl[2])
yy3 = get_btsettl_data_byName(file_names_settl[2])
plt.figure(figsize=(15,8))
plt.plot(xx1.iloc[:, 0].values, yy1.iloc[:, 1].values, color='black')
plt.plot(xx2.iloc[:, 0].values, yy2.iloc[:, 1].values, color='red')
plt.plot(xx3.iloc[:, 0].values, yy3.iloc[:, 1].values, color='green')
plt.title("Raw BT-Settl, Full Spectra", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=18, weight=0)
plt.xlabel("Angstrom", fontsize=18, weight=0)
plt.legend([xx1.index.names[0], xx2.index.names[0], xx3.index.names[0]], loc="upper right", fontsize=15)
plt.xticks(fontsize=15, weight=0)
plt.yticks(fontsize=15, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.ylim((0,0.001))
plt.show()
# Sanity-check sizes of the aligned Carmenes data.
print(len(all_loaded_data_VIS_average[0][0]))
print(len(all_loaded_data_VIS_average[0]))
print(len(all_loaded_data_NIR_average[0][0]))
print(len(all_loaded_data_NIR_average[0]))
""" Displaying that Angstrom values are monotonic increasing ==> by doing this, 'find_nearest' can be used """
for j in range(len(all_loaded_data_VIS_average[0])):
for i in range(len(all_loaded_data_VIS_average[0][j])-1):
if all_loaded_data_VIS_average[0][j].values[i,0] <= all_loaded_data_VIS_average[0][j].values[i+1,0]:
pass
else:
print("NOT ok VIS")
for j in range(len(all_loaded_data_NIR_average[0])):
for i in range(len(all_loaded_data_NIR_average[0][j])-1):
if all_loaded_data_NIR_average[0][j].values[i,0] <= all_loaded_data_NIR_average[0][j].values[i+1,0]:
pass
else:
print("NOT ok NIR")
print("Everythings fine")
print(len(file_names_settl))
# Make some local copies
data_VIS_avg_LOCAL = copy.deepcopy(all_loaded_data_VIS_average)
data_NIR_avg_LOCAL = copy.deepcopy(all_loaded_data_NIR_average)
len(data_NIR_avg_LOCAL[0])
def create_btsettl_data_FINAL(carmenes_data, file_names, output):
    """ Convolution of the BT-Settl data set.
    'file_names' is a list of BT-Settl data file names, output is a string, either "NIR" or "VIS".

    For every BT-Settl model named in ``file_names``, the high-resolution
    spectrum is convolved with a wavelength-dependent Gaussian (resolution
    R = 82000, window of +/- ``nv`` samples) and evaluated on the wavelength
    grid of the FIRST Carmenes data set (``carmenes_data[0]``), order by
    order.  Results are appended to the file given by
    ``create_btsettl_path``; nothing is returned.
    """
    if output.upper() == "NIR":
        light = "_NIR"
    elif output.upper() == "VIS":
        light = "_VIS"
    else:
        print("Some kind of in(out)put error has occured")
    nv, R = 50, 82000  # half-window in samples; spectral resolution
    for name in file_names:
        order = 0
        data = get_btsettl_data_byName(name)
        name = name + light  # output file name carries the NIR/VIS tag
        for m in range(len(carmenes_data[0])):
            small_list = []
            for n in range(len(carmenes_data[0][0])):
                """Perform the necessary calculations"""
                # Get a wavelength value from the carmenes dataset,
                # store it in the variable 'look_here'
                look_here = float(copy.deepcopy(carmenes_data[0][m].values[n,0]))
                # Select the BT-Settl wavelengths
                ar = data['Angstrom'].values
                # Get the BT-Settl wavelength that is closest to 'look_here'
                # (see find_nearest: a value, except 0/n-1 on exact end hits)
                use_this = find_nearest(ar,look_here)
                # Get the corresponding index
                itemindex = np.where(ar==use_this)[0]
                # Convert to integers and floats
                ind = int(itemindex[int(0)])
                use_this = float(use_this)
                # Convolution intervals
                xsg1 = ind - nv
                xsg2 = ind + nv
                # Apply these intervals to the BT-Settl data
                xsgA = data["Angstrom"].iloc[xsg1:(xsg2+1)]
                xsgF = data["Flux"].iloc[(xsg1-1):xsg2]
                # Calculate the impact from the chosen BT-Settl flux values
                xt = copy.deepcopy(use_this)
                # Gaussian sigma derived from FWHM = lambda / R
                sgm = float(use_this/(R*2*np.sqrt(2.*np.log(2.))))
                flt = np.exp(-(xsgA - xt)**2/(2*sgm**2))/(np.sqrt(2*np.pi)*sgm)
                # Calculate the sum of this impact, reverse xsgF before multiplying
                xsgA2 = data["Angstrom"].iloc[(xsg1-1):(xsg2+1)]
                the_sum = np.sum(np.diff(xsgA2)*flt*xsgF[::-1])
                # Add the newly calculated flux to the Carmenes data on the appropriate position
                temp = copy.deepcopy(carmenes_data[0][m].iloc[n,:])
                temp["Flux"] = the_sum
                small_list.append(temp)
            # Add some space and an "#order: N" marker before each order's data
            temp_list = []
            temp_list.append(' ')
            temp_list.append(' ')
            temp_list.append("#order: {0}".format(order))
            pd.DataFrame(temp_list).to_csv(create_btsettl_path(name),
                                           header=False, index=False, mode="a")
            order += 1
            # Create the dataframe
            df = pd.DataFrame(small_list)
            # Convert the df to numerical values
            df = df.apply(pd.to_numeric, errors='ignore')
            # Calculate the Rolling mean for the Flux and equal the "area below" to 1.
            temp_df = (df["Flux"].rolling(2).mean()[1:]*(np.diff(df["Angstrom"])))
            df["Flux"] = df["Flux"]/temp_df.sum()
            # Write to file (append mode, so the file accumulates all orders)
            df.to_csv(create_btsettl_path(name), header=False, index=False, mode="a")
######################%timeit create_btsettl_data_FINAL(data_NIR_avg_LOCAL, file_names_settl[1:2], "NIR")
len(file_names_settl)  # number of BT-Settl files (value only shown in a notebook)
from multiprocessing import Process
# Time the NIR convolution run.
start1 = time.time()
#if __name__ == '__main__':
#    try:
#        mp.set_start_method('fork')
#    except RuntimeError:
#        pass
# Number of cores to use
number_of_cores = 15
files_per_core = 7
# Setup a list of processes that we want to run
# ITS IMPORTANT THAT file_names_settl IS DEFINED WITHIN A RANGE RATHER THAN ONE SPECIFIC POSITION
file_nbr = 3000
# Batches of ``number_of_cores`` processes; each process convolves exactly
# one BT-Settl file (the single-element slice).
for _ in range(files_per_core):
    processes = []
    for _ in range(number_of_cores):
        prr = Process(target=create_btsettl_data_FINAL,
                      args=(data_NIR_avg_LOCAL,
                            file_names_settl[file_nbr:(file_nbr+1)], "NIR"))
        processes.append(prr)
        file_nbr += 1  # next process takes the next file
    # Run processes
    for p in processes:
        p.start()
    # Exit the completed processes
    for p in processes:
        p.join()
# Check how long time it took.
end1 = time.time()
tt1 = end1 - start1
print("Total run time: {0:0.0f} min and {1:3.0f} s ".format((tt1-tt1%60)/60, tt1%60))
from multiprocessing import Process
# Time the VIS convolution run (same batching scheme as the NIR run above).
start1 = time.time()
#if __name__ == '__main__':
#    try:
#        mp.set_start_method('fork')
#    except RuntimeError:
#        pass
number_of_cores = 15
files_per_core = 4
# Setup a list of processes that we want to run
file_nbr = 3000
for _ in range(files_per_core):
    processes = []
    for _ in range(number_of_cores):
        prr = Process(target=create_btsettl_data_FINAL,
                      args=(data_VIS_avg_LOCAL,
                            file_names_settl[file_nbr:(file_nbr+1)], "VIS"))
        processes.append(prr)
        file_nbr += 1  # next process takes the next file
    # Run processes
    for p in processes:
        p.start()
    # Exit the completed processes
    for p in processes:
        p.join()
# Check how long time it took.
end1 = time.time()
tt1 = end1 - start1
print("Total run time: {0:0.0f} min and {1:3.0f} s ".format((tt1-tt1%60)/60, tt1%60))
def get_final_fileInfo122(NIR_or_VIS):
    """Return (file names, file paths) of the final pre-processed .dat files.

    ``NIR_or_VIS`` selects the "NIR" or "VIS" sub-folder (case-insensitive);
    an unrecognised value only prints a message and leaves ``tp`` undefined
    (NameError below).  Names are shortened by dropping the last 10
    characters of each file name.
    """
    kind = NIR_or_VIS.upper()
    if kind == "NIR":
        tp = "NIR/"
    elif kind == "VIS":
        tp = "VIS/"
    else:
        print("Something is not correct")
    # Choose directory depending on the OS I'm running
    # (similar code is found through the entire project).
    if "Darwin" in os.uname():
        directory = "/Users/jakob/Desktop/Master Project/bt-settl/" + tp
    else:
        directory = "/home/salomonsson/Master Project/bt-settl-PLOT/" + tp
    file_paths = glob(directory + "*.dat")
    #print(file_paths)
    # Collect the .dat file names, trimming the shared trailing information.
    file_names = []
    for _root, _dirs, files in os.walk(directory):
        for fname in files:
            if fname.endswith(".dat"):
                file_names.append(fname[:-10])
    return file_names, file_paths
NIR_new_final_file_names_NEW, NIR_new_final_file_paths_NEW = get_final_fileInfo122("NIR")
print("This many NIR files were loaded: {0}".format(len(NIR_new_final_file_names_NEW)))
def load_FINAL_data122(paths, names):
    """Load the final pre-processed data files listed in `paths`.

    Parameters
    ----------
    paths : list of str
        Full paths to the .dat files (comma separated, 3 header rows,
        '#' comment lines, file name ending in "..._NIR.dat" / "..._VIS.dat").
    names : list of str
        File names used to label each dataframe (parallel to `paths`).

    Returns
    -------
    tuple(list, list)
        The loaded dataframes (columns "Angstrom"/"Flux", NaN rows dropped,
        index named "<name>_<NIR|VIS>") and their index names.
    """
    all_loaded_data = []  # Create a list to store the data.
    all_loaded_data_NAMES = []
    for path, name in zip(paths, names):
        data = pd.read_table(path, header=None, skip_blank_lines=True,
                             comment='#', skiprows=3, sep=",")
        data.dropna(inplace=True)  # Remove NaN-values (if any)
        NoV = path[-7:-4]  # "NIR" or "VIS", taken from the file-name suffix
        # Label the dataframe "<name>_<NIR|VIS>".  (The old code first set the
        # name, then rebuilt it by slicing str(data.index.names); this direct
        # assignment is equivalent and far less fragile.)
        data.index.names = [name + "_" + str(NoV)]
        data.columns = ["Angstrom", "Flux"]
        all_loaded_data.append(data)
        all_loaded_data_NAMES.append(data.index.names)
    return all_loaded_data, all_loaded_data_NAMES
# Load all of the final pre-processed NIR dataframes.
# NOTE(review): the data is loaded twice here (first via `paths`/`names`, then
# again via `pathsNIR`/`namesNIR`); the second load overwrites the first and
# looks redundant — confirm and drop one of them.
paths = NIR_new_final_file_paths_NEW
names = NIR_new_final_file_names_NEW
NIR_new_loaded_FINAL_data_NEW, NIR_new_loaded_final_NAMES_NEW = load_FINAL_data122(paths, names)
NIR_new_final_file_names_NEW, NIR_new_final_file_paths_NEW = get_final_fileInfo122("NIR")
pathsNIR = NIR_new_final_file_paths_NEW
namesNIR = NIR_new_final_file_names_NEW
NIR_new_loaded_FINAL_data_NEW, NIR_new_loaded_final_NAMES_NEW = load_FINAL_data122(pathsNIR, namesNIR)
# For every NIR file: plot a slice of the original, un-processed spectrum next
# to the fully pre-processed version so the two can be compared by eye.
# NOTE(review): `get_btsettl_data_byName` is defined in an earlier cell.
for i in range(len(NIR_new_final_file_names_NEW)):
    original_data = get_btsettl_data_byName(NIR_new_final_file_names_NEW[i])
    print(original_data.iloc[:1,:])
    # Only plot rows 250000-600000 of the raw spectrum
    xx_orig = original_data.iloc[250000:600000, 0].values
    yy_orig = original_data.iloc[250000:600000, 1].values
    plt.figure(figsize=(15,8))
    plt.plot(xx_orig, yy_orig, color='green')
    plt.title("Original Un-processed Data, NIR", fontsize=20, weight=0)
    plt.xlim(xx_orig[0], xx_orig[-1])
    plt.autoscale(enable=True, axis="x", tight=True)
    loc = matplotlib.ticker.LinearLocator(numticks=5)  # exactly 5 x-axis ticks
    plt.gca().xaxis.set_major_locator(loc)
    plt.xticks(fontsize=15, weight=0)
    plt.yticks(fontsize=15, weight=0)
    plt.show()
    # Now the pre-processed counterpart
    print(NIR_new_loaded_FINAL_data_NEW[i].iloc[:1,:])
    xx = NIR_new_loaded_FINAL_data_NEW[i].iloc[:, 0].values
    yy = NIR_new_loaded_FINAL_data_NEW[i].iloc[:, 1].values
    plt.figure(figsize=(15,8))
    plt.plot(xx, yy, color='green')
    plt.autoscale(enable=True, axis="x", tight=True)
    plt.title("Pre-processed Data, NIR", fontsize=20, weight=0)
    plt.xticks(fontsize=15, weight=0)
    plt.yticks(fontsize=15, weight=0)
    plt.show()
    print("=============================================================================================================")
    print("=============================================== STARTING NEW ================================================")
    print("=============================================================================================================")
    print()
    print()
    print()
# Overlay three pre-processed NIR spectra in one figure for comparison.
xx1 = NIR_new_loaded_FINAL_data_NEW[3].iloc[:, 0]
yy1 = NIR_new_loaded_FINAL_data_NEW[3].iloc[:, 1]
xx2 = NIR_new_loaded_FINAL_data_NEW[2].iloc[:, 0]
yy2 = NIR_new_loaded_FINAL_data_NEW[2].iloc[:, 1]
xx3 = NIR_new_loaded_FINAL_data_NEW[-5].iloc[:, 0]
yy3 = NIR_new_loaded_FINAL_data_NEW[-5].iloc[:, 1]
plt.figure(figsize=(15,8))
plt.plot(xx1.values, yy1.values, color='green')
plt.plot(xx2.values, yy2.values, color='red')
plt.plot(xx3.values, yy3.values, color='black')
plt.title("Data ready for POWER matrices, NIR", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=18, weight=0)
plt.xlabel("Angstrom", fontsize=18, weight=0)
# The dataframe index names double as legend labels (file name + channel).
plt.legend([xx1.index.names[0], xx2.index.names[0], xx3.index.names[0]], loc="upper right", fontsize=15)
plt.xticks(fontsize=14, weight=0)
plt.yticks(fontsize=14, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.show()
# Fetch names/paths and load the dataframes for the pre-processed VIS data.
VIS_new_final_file_names_NEW, VIS_new_final_file_paths_NEW = get_final_fileInfo122("VIS")
#VIS_new_final_file_names_NEW
pathsVIS = VIS_new_final_file_paths_NEW
namesVIS = VIS_new_final_file_names_NEW
VIS_new_loaded_FINAL_data_NEW, VIS_new_loaded_final_NAMES_NEW = load_FINAL_data122(pathsVIS, namesVIS)
print("This many VIS files were loaded: {0}".format(len(VIS_new_loaded_FINAL_data_NEW)))
# For every VIS file: plot a slice of the original, un-processed spectrum next
# to the fully pre-processed version so the two can be compared by eye.
for i in range(len(VIS_new_final_file_names_NEW)):
    original_data = get_btsettl_data_byName(VIS_new_final_file_names_NEW[i])
    print(original_data.iloc[:1,:])
    # Only plot rows 100000-350000 of the raw spectrum
    xx_orig = original_data.iloc[100000:350000, 0].values
    yy_orig = original_data.iloc[100000:350000, 1].values
    plt.figure(figsize=(15,8))
    plt.plot(xx_orig, yy_orig)
    plt.title("Original Un-processed Data, VIS", fontsize=20, weight=0)
    plt.xlim(xx_orig[0], xx_orig[-1])
    plt.autoscale(enable=True, axis="x", tight=True)
    loc = matplotlib.ticker.LinearLocator(numticks=5)  # exactly 5 x-axis ticks
    plt.xticks(fontsize=15, weight=0)
    plt.yticks(fontsize=15, weight=0)
    plt.gca().xaxis.set_major_locator(loc)
    plt.show()
    # Now the pre-processed counterpart
    print(VIS_new_loaded_FINAL_data_NEW[i].iloc[:1,:])
    xx = VIS_new_loaded_FINAL_data_NEW[i].iloc[:, 0].values
    yy = VIS_new_loaded_FINAL_data_NEW[i].iloc[:, 1].values
    plt.figure(figsize=(15,8))
    plt.plot(xx, yy)
    plt.title("Pre-processed Data, VIS", fontsize=20, weight=0)
    plt.xticks(fontsize=15, weight=0)
    plt.yticks(fontsize=15, weight=0)
    plt.autoscale(enable=True, axis="x", tight=True)
    plt.show()
    print("=============================================================================================================")
    print("=============================================== STARTING NEW ================================================")
    print("=============================================================================================================")
    print()
    print()
    print()
# Overlay three pre-processed VIS spectra in one figure for comparison.
x1 = VIS_new_loaded_FINAL_data_NEW[1].iloc[:, 0]
y1 = VIS_new_loaded_FINAL_data_NEW[1].iloc[:, 1]
x2 = VIS_new_loaded_FINAL_data_NEW[4].iloc[:, 0]
y2 = VIS_new_loaded_FINAL_data_NEW[4].iloc[:, 1]
x3 = VIS_new_loaded_FINAL_data_NEW[6].iloc[:, 0]
y3 = VIS_new_loaded_FINAL_data_NEW[6].iloc[:, 1]
plt.figure(figsize=(15,8))
plt.plot(x1.values, y1.values, color='green')
plt.plot(x2.values, y2.values, color='red')
plt.plot(x3.values, y3.values, color='black')
plt.title("Data ready for POWER matrices, VIS", fontsize=20, weight=0)
#plt.title("Final Pre-processed data, VIS", fontsize=20, weight=0)
plt.ylabel("Flux", fontsize=18, weight=0)
plt.xlabel("Angstrom", fontsize=18, weight=0)
# The dataframe index names double as legend labels (file name + channel).
plt.legend([x1.index.names[0], x2.index.names[0], x3.index.names[0]], loc="upper right", fontsize=15)
plt.xticks(fontsize=14, weight=0)
plt.yticks(fontsize=14, weight=0)
plt.autoscale(enable=True, axis="x", tight=True)
plt.show()
def get_btsettl_READY_Info(NIR_or_VIS):
    """ Returns the file paths and names for the pre-processed bt-settl data.

    Parameters
    ----------
    NIR_or_VIS : str
        Either 'NIR' or 'VIS' (case-insensitive).

    Returns
    -------
    tuple(list, list)
        (btsettl_READY_names, btsettl_READY_paths) of all .dat files found.

    Raises
    ------
    ValueError
        If the argument is neither 'NIR' nor 'VIS'.  (The old code only
        printed a message and then crashed on an unbound `tp`.)
    """
    channel = NIR_or_VIS.upper()
    if channel == "NIR":
        tp = "NIR/"
    elif channel == "VIS":
        tp = "VIS/"
    else:
        raise ValueError('The input is not correct. Enter "VIS" or "NIR" as strings. ')
    # Choose directory depending on the OS I'm running
    if "Darwin" in os.uname():
        directory = "/Users/jakob/Desktop/Master Project/bt-settl (new version)/" + tp
    else:
        directory = "/home/salomonsson/Master Project/bt-settl (new version)/" + tp
        #directory = "/home/salomonsson/Master Project/bt-settl-CLEAN/" + tp
    btsettl_READY_paths = glob(directory + "*.dat")
    # Get the .dat file names
    btsettl_READY_names = []
    for root, dirs, files in os.walk(directory):
        for filename in files:
            if filename.endswith(".dat"):
                filename = filename[:-4]  # Drop the ".dat" extension
                btsettl_READY_names.append(filename)
    return btsettl_READY_names, btsettl_READY_paths
# Collect names/paths of the pre-processed bt-settl data for both channels.
btsettl_READY_names_NIR, btsettl_READY_paths_NIR = get_btsettl_READY_Info("NIR")
btsettl_READY_names_VIS, btsettl_READY_paths_VIS = get_btsettl_READY_Info("VIS")
# This function needed to be rewritten even though it does the same thing on very
# similar input data as the "load_all_data" function used at the very beginning of
# this document. I haven't found a reason why locating all the "#" marker rows
# didn't work as before.
def load_btsettl_READY_data(name):
    """ Return the pre-processed btsettl dataframe called `name`, split per order.

    The stored file contains several spectral orders separated by '#' marker
    rows; each order is returned as its own dataframe, labelled
    "<name>_Order_<n>", with columns "Angstrom"/"Flux" and NaN rows removed.

    Raises
    ------
    ValueError
        If `name` contains neither "NIR" nor "VIS".  (The old code only
        printed a hint and then crashed on an unbound `paths`.)
    """
    if "NIR" in name:
        paths = btsettl_READY_paths_NIR
    elif "VIS" in name:
        paths = btsettl_READY_paths_VIS
    else:
        raise ValueError('Input is incorrect. Enter a file name.')
    for path in paths:
        if name in path:
            data = pd.read_table(path, sep=",", header=None, skiprows=3)
            data.index.names = [name]  # Add the file name to the dataframe
            # Find every row containing a '#' order marker.  The dict is
            # pre-seeded with order 0 because the very first marker row is
            # consumed by skiprows=3.  (Looking the markers up with
            # data.values == '#' did not work here, hence the explicit scan.)
            limits = {'# order: 0': 0}
            for i in range(len(data)):
                if "#" in data.iloc[i, 0]:
                    limits[data.iloc[i, 0]] = data.index[i]
            keys = [key for key, value in limits.items()]
            values = [int(value) for key, value in limits.items()]
            data_pieces, count = [], 0
            for i in range(len(limits)):
                start = values[count]
                if count == (len(limits) - 1):
                    d_piece = data.iloc[start:, :]  # Last piece runs to the end of the df
                else:
                    d_piece = data.iloc[start:values[count + 1], :]  # Select the correct data piece
                d_piece.index.names = [str(d_piece.index.names)[2:-2] + "_Order_" + str(count)]  # add order to name
                d_piece.columns = ["Angstrom", "Flux"]
                d_piece = d_piece.dropna(inplace=False)  # copy: avoids a "SettingWithCopyWarning"
                data_pieces.append(d_piece)
                count += 1
    return data_pieces
# Smoke-test the loader on the first VIS file and display its name.
test_data = load_btsettl_READY_data(btsettl_READY_names_VIS[0])
btsettl_READY_names_VIS[0]  # notebook cell output: show which file was loaded
Check whether all the orders exist in both the NIR and VIS files (it's easier to check this in the CARMENES data). If the orders don't match, remove those that are not common to all the data.
# Check whether all the orders exists in both the NIR and VIS files. (Its easier to check this in the carmenes data)
def check_orders(data):
    """ Check that every dataframe list in `data` has the same number of orders.

    Parameters
    ----------
    data : list
        List of (lists of) dataframes; only len() of each element is compared.

    Returns
    -------
    bool
        True if every element has the same length as the first one, else False.
    """
    # BUG FIX: the original loop `break`ed as soon as ONE element matched
    # data[0], so only data[1] was ever checked; now every element is tested.
    first = len(data[0])
    for d in range(1, len(data)):
        if len(data[d]) != first:
            print("Number of orders is not the same in all dataframes")
            return False
    print("Number of orders is equal")
    return True
# I skip this for now as the data is OK.
def remove_non_common_orders(data):
    """ data is a list with dataframes.
    If there are any orders that aren't common for all dataframes, remove them.

    NOTE(review): despite the name, this currently only *prints* the order
    numbers of the dataframes whose length deviates; nothing is removed and
    nothing is returned.
    """
    # First, check how many orders there should be
    how_many = []
    for i in range(len(data)):
        how_many.append(len(data[i]))
    should_be = max(how_many)  # assume the maximal length is the correct one
    all_bad_lengths = []
    for l in range(len(data)):
        if len(data[l]) != should_be:  # if length is not as it should be
            this_length = []
            for ll in range(len(data[l])):
                # The order number sits at the end of the index name; the magic
                # slice [34:-2] cuts it out of the str(FrozenList(...)) text.
                order = str(data[l][ll].index.names)[34:-2]  # get the order number
                this_length.append(order)
            all_bad_lengths.append(this_length)
    print(all_bad_lengths)
# wavelets documentation
# http://pycwt.readthedocs.io/en/latest/tutorial.html#time-series-spectral-analysis-using-wavelets
def get_POWER_matrix(data, rescale=True):
    """ Returns the power matrix, the scale indices and the Fourier frequencies.
    Set 'rescale' to False if rescaling is not desired. """
    # Build an artificial, evenly spaced "time" axis for the detrending fit.
    t0, dt, N = 0, 0.5, data.size
    t = np.arange(0, N) * dt*2 + t0  # NOTE(review): dt*2 makes the step 1.0 — confirm intended
    # Remove the linear trend, then normalise to unit variance.
    p = np.polyfit(t - t0, data, 1)
    dat_notrend = data - np.polyval(p, t - t0)
    std = dat_notrend.std()  # Standard deviation
    var = std ** 2  # Variance (unused below)
    dat_norm = dat_notrend / std  # Normalised dataset
    mother = wavelet.Morlet(6)  # Morlet transformation function
    # NOTE(review): `mother` is never passed to cwt below, so pycwt falls back
    # to its own default wavelet — confirm that is what is wanted.
    # The following routines perform the wavelet transformation using the parameters defined above.
    wave, scales, freqs, coi, fft, fftfreqs = wavelet.cwt(dat_norm, dt, J=-1)
    power = (np.abs(wave)) ** 2  # wavelet power spectrum
    if rescale == True:
        # Rescale everything to 0-255 and store as 16-bit ints to save memory.
        power *= 255.0/power.max()  # rescale 0-255
        power = power.astype('h')  # dtype = short
        scales *= 255.0/scales.max()
        scales = scales.astype('h')
        freqs *= 255.0/freqs.max()
        freqs = freqs.astype('h')
    return power, scales, freqs
def get_FULL_POWER_matrix(data):
    """ Returns the power matrix, the scale indices and the Fourier frequencies
    of "data", with all orders included (concatenated along axis 0).

    Parameters
    ----------
    data : list of pandas.DataFrame
        One dataframe per spectral order; each must have a "Flux" column.

    Returns
    -------
    tuple(np.ndarray, np.ndarray, np.ndarray)
        (power_matrix, full_scales, full_freqs).
    """
    # The original seeded the accumulators with np.array([1]) and used a
    # fragile `len(...) > 10` test to decide whether a matrix had been created
    # yet; a None sentinel is correct even for orders with <= 10 rows.
    power_matrix, full_scales, full_freqs = None, None, None
    for order in data:
        power, scales, freqs = get_POWER_matrix(order.loc[:, "Flux"])
        if power_matrix is None:
            # First order: start the accumulators.
            power_matrix = np.array(power)
            full_scales = np.array(scales)
            full_freqs = np.array(freqs)
        else:
            power_matrix = np.append(power_matrix, power, axis=0)
            full_scales = np.append(full_scales, scales, axis=0)
            full_freqs = np.append(full_freqs, freqs, axis=0)
    return power_matrix, full_scales, full_freqs
def plot_data(data, rescale=True):
    """ Plot the wavelet power spectrum (spectrogram) for a single order.

    data : one-column array/Series of flux values; its index name is used in
    the plot title.  Set 'rescale' to False to plot the raw power values.
    """
    t0, dt, N = 0, 0.5, data.size
    t = np.arange(0, N) * dt*2 + t0  # artificial "time" axis (sample index)
    # Detrend and normalise the series, like get_POWER_matrix does.
    p = np.polyfit(t - t0, data, 1)
    dat_notrend = data - np.polyval(p, t - t0)
    std = dat_notrend.std()  # Standard deviation
    var = std ** 2  # Variance (unused below)
    dat_norm = dat_notrend / std  # Normalised dataset
    mother = wavelet.Morlet(6)  # Morlet transformation function
    s0 = 2 * dt  # Starting scale (unused; cwt below is called with defaults)
    dj = 1 / 48  # Sub-octave resolution (unused; cwt below is called with defaults)
    J = -1
    alpha, _, _ = wavelet.ar1(data)  # Lag-1 autocorrelation for red noise
    # The following routines perform the wavelet transformation using the parameters defined above.
    wave, scales, freqs, coi, fft, fftfreqs = wavelet.cwt(dat_norm, dt, J=-1)
    power = (np.abs(wave)) ** 2
    if rescale == True:
        power *= 255.0/power.max()  # rescale 0-255
        power = power.astype('h')  # dtype = short
    period = 1 / freqs
    # 95% significance test against a red-noise background.
    signif, fft_theor = wavelet.significance(1.0, dt, scales, 0,
                                             alpha, significance_level=0.95, wavelet=mother)
    sig95 = np.ones([1, N]) * signif[:, None]
    sig95 = power / sig95  # ratio > 1 means significant at the 95% level
    # Finally, we plot our results.
    plt.close('all')
    plt.ioff()
    fig = plt.figure(figsize=(12.5, 40), dpi=80)
    # The normalised wavelet power spectrum with significance level contour
    # lines. Note that the period scale is logarithmic.
    bx = plt.axes([0.1, 0.37, 0.65, 0.28])
    levels = [0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8, 16]
    bx.contourf(t, np.log2(period), np.log2(power), np.log2(levels),
                extend='both', cmap=plt.cm.viridis)
    extent = [t.min(), t.max(), 0, max(period)]
    bx.contour(t, np.log2(period), sig95, [-99, 1], colors='k', linewidths=1, extent=extent)
    bx.set_ylabel('Period', fontsize=18)
    bx.set_xlabel('Index', fontsize=18)
    plt.autoscale(enable=True, axis="x", tight=True)
    # Power-of-two tick labels on the (log2) period axis.
    Yticks = 2 ** np.arange(np.ceil(np.log2(period.min())), np.ceil(np.log2(period.max())))
    bx.set_yticks(np.log2(Yticks))
    bx.set_yticklabels(Yticks, fontsize=14)
    plt.xticks(fontsize=14)
    plt.title("POWER matrix (Spectrogram) for " + str(data.index.names)[5:-2], fontsize=18, weight=0)
    plt.show()
# Make sure the equal amount of orders exists in all NIR and VIS files respectively.
# NOTE(review): `all_loaded_data_NIR` / `all_loaded_data_VIS` come from earlier cells.
NIR_OK, VIS_OK = "NO", "NO"
if check_orders(all_loaded_data_NIR) == True:
    NIR_OK = "YES"
if check_orders(all_loaded_data_VIS) == True:
    VIS_OK = "YES"
test_data[3].loc[:, "Flux"].shape  # notebook output: sanity-check one order's length
# Plot the power matrices if the orders are equal in the files.
if VIS_OK == "YES" and NIR_OK == "YES":
    for i in range(5):
        #print("POWER matrix (Spectrogram) for " + str(test_data[i+1].loc[:, "Flux"].index.names)[5:-2])
        plot_data(test_data[i+1].loc[:, "Flux"], rescale=False)
def create_MATRIX_path(name, NIR_or_VIS):
    """Create the .csv file path under which a power matrix is stored.

    Parameters
    ----------
    name : str
        The matrix/file name (without extension).
    NIR_or_VIS : str
        Either "NIR" or "VIS" (case-insensitive); selects the sub-directory.

    Returns
    -------
    str
        Full path "<base>/<NIR|VIS>/<name>.csv".

    Raises
    ------
    ValueError
        If NIR_or_VIS is neither "NIR" nor "VIS".
    """
    channel = NIR_or_VIS.upper()
    if channel == "NIR":
        tp = "NIR/"
    elif channel == "VIS":
        tp = "VIS/"
    else:
        # The old message had the two words scrambled ("NIS"/"VIR") and the
        # function then crashed on an unbound `tp`; raise a clear error instead.
        raise ValueError('Please enter "NIR" or "VIS" as second input parameter.')
    if "Darwin" in os.uname():
        base = "/Users/jakob/Desktop/Master Project/POWER_matrices/" + tp
    else:
        base = "/home/salomonsson/drive/POWER_matrices/" + tp
    return base + name + ".csv"
### Each NIR file takes some 20 seconds to process. 335 min in total ###
btsettl_READY_names_NIR, btsettl_READY_paths_NIR = get_btsettl_READY_Info("NIR")
# Pick the output directory for the power matrices depending on the OS.
if "Darwin" in os.uname():
    d = "/Users/jakob/Desktop/Master Project/POWER_matrices/NIR/"
else:
    d = "/home/salomonsson/drive/POWER_matrices/NIR/"
nir_files = []
for dirpath, dirnames, files in os.walk(d):  # Get the amount of NIR files in the directory d
    for f in files:
        if "NIR" in f:
            nir_files.append(f)
if len(nir_files) < (len(btsettl_READY_names_NIR)):  # if all the NIR files haven't been "converted" to power matrices
    start1 = time.time()
    if NIR_OK == "YES":
        for i in range(len(btsettl_READY_names_NIR)):  #len(btsettl_READY_names_NIR)
            NIR_name = btsettl_READY_names_NIR[i]
            NIR_d = load_btsettl_READY_data(NIR_name)
            NIR_m = get_FULL_POWER_matrix(NIR_d)[0]  # Get the Flux's power matrix
            NIR_path = create_MATRIX_path(NIR_name, 'NIR')
            # Store as 16-bit ints ('h' = short) to keep the csv small.
            pd.DataFrame(NIR_m, dtype='h').to_csv(NIR_path, header=True, index=False, mode="w")  # dtype = short
    # Check how long time it took.
    end1 = time.time()
    tt1 = end1 - start1
    print("Total run time: {0:0.0f} min and {1:3.0f} s ".format((tt1-tt1%60)/60, tt1%60))
else:
    print("All the power matrices for the pre-processed NIR files are already in the directory")
### Each VIS file takes some 30 seconds to process. ###
btsettl_READY_names_VIS, btsettl_READY_paths_VIS = get_btsettl_READY_Info("VIS")
# Pick the output directory for the power matrices depending on the OS.
if "Darwin" in os.uname():
    d = "/Users/jakob/Desktop/Master Project/POWER_matrices/VIS/"
else:
    d = "/home/salomonsson/drive/POWER_matrices/VIS/"
    #d = "/home/salomonsson/drive/POWER_matrices_CLEAN/VIS/"
vis_files = []
for dirpath, dirnames, files in os.walk(d):  # Get the amount of VIS files in the directory d
    for f in files:
        if "VIS" in f:
            vis_files.append(f)
if len(vis_files) < len(btsettl_READY_names_VIS):  # if all the VIS files haven't been "converted" to power matrices
    start1 = time.time()
    if VIS_OK == "YES":
        for i in range(len(btsettl_READY_names_VIS)):
            VIS_name = btsettl_READY_names_VIS[i]
            VIS_d = load_btsettl_READY_data(VIS_name)
            VIS_m = get_FULL_POWER_matrix(VIS_d)[0]
            VIS_path = create_MATRIX_path(VIS_name, 'VIS')
            # Store as 16-bit ints ('h' = short) to keep the csv small.
            pd.DataFrame(VIS_m, dtype='h').to_csv(VIS_path, header=True, index=False, mode="w")  # dtype = short
    # Check how long time it took.
    end1 = time.time()
    tt1 = end1 - start1
    print("Total run time: {0:0.0f} min and {1:3.0f} s ".format((tt1-tt1%60)/60, tt1%60))
else:
    print("All the power matrices for the pre-processed VIS files are already in the directory")
# https://github.com/navoshta/behavioral-cloning/blob/master/model.py
def CNN_model_for_FULL(input_shape):
    """ Create the model architecture. """
    # (filters, kernel size) for each convolutional stage; every stage is
    # followed by a 2x2 max-pooling layer, exactly as in the original layout.
    conv_specs = [(16, 7), (32, 7), (64, 7),
                  (128, 5), (256, 5), (256, 5),
                  (512, 3), (512, 3), (1024, 3), (1024, 3)]
    model = Sequential()
    first_filters, first_kernel = conv_specs[0]
    model.add(Conv2D(first_filters, (first_kernel, first_kernel),
                     input_shape=input_shape, activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    for n_filters, k in conv_specs[1:]:
        model.add(Conv2D(n_filters, (k, k), activation='relu'))
        model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    # model.add(Dense(500, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(20, activation='relu'))
    # As we're building a regressor, the last activation function needs to be linear
    # https://towardsdatascience.com/activation-functions-and-its-types-which-is-better-a9a5310cc8f
    model.add(Dense(3, activation="linear"))  # We want to determine 3 different parameters
    #model.compile(optimizer=optimizers.Adam(lr=1e-04), loss='mean_squared_error')
    # Important to note: for regression cases, one needs to use mse or mae as loss function.
    # You can't use softmax as activation (since the output of the model isn't supposed to be probability)!
    # An alternative is to drop the two last Conv layers and swap Dense 100 for Dense 500.
    model.summary()
    return model
def CNN_model_FULL(input_shape):
    """ Create the model architecture. """
    # (filters, kernel size) for each convolutional stage; every stage is
    # followed by a 2x2 max-pooling layer, exactly as in the original layout.
    conv_specs = [(16, 7), (32, 7), (64, 7),
                  (128, 5), (256, 5), (512, 5),
                  (1024, 3), (1024, 3)]
    model = Sequential()
    first_filters, first_kernel = conv_specs[0]
    model.add(Conv2D(first_filters, (first_kernel, first_kernel),
                     input_shape=input_shape, activation='relu'))
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    for n_filters, k in conv_specs[1:]:
        model.add(Conv2D(n_filters, (k, k), activation='relu'))
        model.add(pooling.MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(50, activation='relu'))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(3, activation="linear"))  # We want to determine 3 different parameters
    model.summary()
    return model
model.add(Reshape((x, y))): If the output of the MP layer is [None, 10, 10], then the reshape layer must produce a shape with exactly 10*10=100 elements; a valid shape would be (100,) or (25, 4). Reshaping only rearranges the dimensions of the data — it does not add or remove data to reach the requested output shape.
The shape (None, 2) simply means that the network accepts a batch of any number of elements, each of which is an array of two elements (the first element of a Keras shape is the batch size). So the correct input data shape would be (2,).
def substring_before(s, delim):
    """ Divide the string 's' at the first occurrence of 'delim'.

    Returns
    -------
    tuple(str, tuple)
        (before, (before, delim, after)): the part before the delimiter plus
        the full str.partition result.  (The original called partition twice;
        one call suffices.)
    """
    parts = s.partition(delim)
    return parts[0], parts
def get_MATRIX_paths(NIR_or_VIS):
    """ Return all the stored power-matrix .csv paths for 'NIR' or 'VIS'.

    Raises
    ------
    ValueError
        If the argument is neither "NIR" nor "VIS".  (The old code printed a
        scrambled "NIS"/"VIR" message and then crashed on an unbound `tp`.)
    """
    channel = NIR_or_VIS.upper()
    if channel == "NIR":
        tp = "NIR/"
    elif channel == "VIS":
        tp = "VIS/"
    else:
        raise ValueError('Please enter "NIR" or "VIS".')
    if "Darwin" in os.uname():
        directory = "/Users/jakob/Desktop/Master Project/POWER_matrices/" + tp
    else:
        directory = "/home/salomonsson/drive/POWER_matrices/" + tp
    return glob(directory + "*.csv")
def get_Matrix_and_Parameters(path):
    """ Returns the matrix stored in 'path', with the corresponding parameters
    (Temperature, Surface Gravity and Metallicity) parsed out of the file name,
    plus the file name itself.

    NOTE(review): the parsing assumes a name shaped like
    "<3-char prefix><Teff><sign><logg><sign><metallicity>[a...]"; confirm
    against the actual bt-settl file names before changing anything here.
    """
    matrix = pd.read_csv(path)  # load the power matrix
    parameters = []
    filename = os.path.basename(path)[:-4]  # strip the ".csv" extension
    name = filename  # keep the untouched name for the caller
    # Process the file name and extract the individual parameters. Store them in a list.
    #####
    # Remove the alpha parameter (delimited by 'a' when present, '_' otherwise)
    # and drop the 3-character prefix.
    for l in str(filename):
        if 'a' in l:  # if 'a' is in the file name
            d = 'a'
            break
        else:
            d = '_'
    filename = substring_before(filename[3:], d)[0]
    # Extract the Temperature: find the first sign character among the
    # first six characters; everything before it is the temperature.
    for l in str(filename[:6]):
        if '-' in l:
            d1 = '-'
            break
        elif '+' in l:
            d1 = '+'
            break
        else:
            pass
    p, rest = substring_before(filename, d1)
    parameters.append(float(p))
    # Extract the Surface gravity, Log(G); the sign d1 found above is
    # prepended so the gravity value keeps a sign.
    for l in str(rest[2][:5]):
        if '-' in l:
            de = '-'
            break
        elif '+' in l:
            de = '+'
            break
        else:
            pass
    p1, rest1 = substring_before(rest[2], de)
    parameters.append(float(d1 + p1))
    # Extract the Metallicity (sign + value after the next delimiter).
    for l in str(rest[2]):
        if '-' in l:
            de1 = '-'
            break
        elif '+' in l:
            de1 = '+'
            break
        else:
            pass
    p2, rest2 = substring_before(rest[2], de1)
    parameters.append(float(rest2[1] + rest2[2]))
    #####
    #print(parameters)
    return matrix.values, np.array(parameters), name
The training both runs faster and yields better results when the parameters are scaled.
# Functions for scaling and rescaling the parameters.
def MMscale_param(param, name):
    """ Min-max scale the parameters in `param` using the scaler registered
    under `name` (the matrix name the parameters originate from).

    `name` must contain "NIR" or "VIS" to pick the right module-level scaler
    dict (many_MinMaxScalers_NIR_param / many_MinMaxScalers_VIS_param).

    Raises
    ------
    ValueError
        If `name` contains neither "NIR" nor "VIS".  (The old code printed a
        hint and then crashed with an unbound `scalers`.)
    """
    if "NIR" in name:
        scalers = many_MinMaxScalers_NIR_param
    elif "VIS" in name:
        scalers = many_MinMaxScalers_VIS_param
    else:
        raise ValueError('Have you entered a valid name?')
    scaled_param = scalers[name].fit_transform(param.reshape(-1, 1))
    return scaled_param
def Un_scale_data(scaled_param, name):
    """ Invert the min-max scaling of `scaled_param` using the scaler
    registered under `name` (the matrix name the parameters originate from).

    Raises
    ------
    ValueError
        If `name` contains neither "NIR" nor "VIS".  (The old code printed a
        hint and then crashed with an unbound `scalers`.)
    """
    if "NIR" in name:
        scalers = many_MinMaxScalers_NIR_param
    elif "VIS" in name:
        scalers = many_MinMaxScalers_VIS_param
    else:
        raise ValueError('Have you entered a valid name?')
    un_scaled_param = scalers[name].inverse_transform(scaled_param.reshape(-1, 1))
    return un_scaled_param
def normalise(array):
    """ Map `array` from [0, 255] into [0, 1] as float32 to save memory,
    inverting the values on the way (255 -> 0.0, 0 -> 1.0).
    NOTE(review): the inversion (255 - x) may be deliberate, but confirm —
    a plain x/255 would preserve orientation. """
    as_float = array.astype("float32")
    return (255.0 - as_float) / 255.0
# The difference between float32 and float16 is under 0.5% on the full-size matrix. Test float16 as well!
Specify one scaler for each data matrix, named after the matrix it belongs to.
# Create one MinMaxScaler per power-matrix file, keyed by the matrix file name.
NIR_paths = get_MATRIX_paths("NIR")
VIS_paths = get_MATRIX_paths("VIS")
many_MinMaxScalers_NIR_param, many_MinMaxScalers_VIS_param = {}, {}
for i_s in range(len(NIR_paths)):
    name = os.path.basename(NIR_paths[i_s])[:-4]  # file name without ".csv"
    many_MinMaxScalers_NIR_param["{0}".format(name)]=MinMaxScaler(feature_range=(0,1), copy=True)
    # Set copy=False to perform inplace row normalisation and avoid a copy
for i_v in range(len(VIS_paths)):
    name = os.path.basename(VIS_paths[i_v])[:-4]  # file name without ".csv"
    many_MinMaxScalers_VIS_param["{0}".format(name)]=MinMaxScaler(feature_range=(0,1), copy=True)
Sequences are a safer way to do multiprocessing: this structure guarantees that the network trains only once on each sample per epoch, which is not the case with generators. The generator produces data batches as input to the algorithm, since feeding it all the data at once isn't possible due to memory limitations.
### https://stackoverflow.com/questions/47200146/keras-load-images-batch-wise-for-large-dataset
### https://keras.io/utils/#sequence + modifications
class My_Sequence(Sequence):
    """ Generates batches of training data and ground truth.
    Inputs are the image paths and batch size.
    """
    def __init__(self, image_paths, batch_size):
        self.image_paths, self.batch_size = image_paths, batch_size
    def __len__(self):
        # Number of batches: ceil(#paths / batch_size)
        return int(np.ceil(len(self.image_paths) / float(self.batch_size)))
    def __getitem__(self, idx):
        lo = idx * self.batch_size
        batch_paths = self.image_paths[lo:lo + self.batch_size]
        batch_mats, batch_params = [], []
        for file_path in batch_paths:
            matrix, params, matrix_name = get_Matrix_and_Parameters(file_path)
            # Turn the 2D matrix into a 3-channel (RGB-like) image, down-sized
            # by `down_size`.  preserve_range=True skips skimage's own value
            # rescaling, saving memory and load time.
            matrix = skimage.transform.resize(
                matrix,
                (matrix.shape[0] // down_size, matrix.shape[1] // down_size, 3),
                mode='constant', preserve_range=True)
            batch_params.append(MMscale_param(params, matrix_name))  # scale the targets
            batch_mats.append(normalise(matrix))                     # matrix -> [0, 1]
        MAT = np.array(batch_mats)
        PAM = np.array(batch_params)
        PAM = np.reshape(PAM, (PAM.shape[0], PAM.shape[1]))
        gc.collect()  # Garbage Collector
        return MAT, PAM
def Data_Generator(image_paths, batch_size=16):
    """ Generates batches of training data and ground truth.

    Yields (MAT, PAM) tuples forever (Keras requires an infinite generator):
    MAT is an array of down-sized, normalised 3-channel matrices, PAM an
    array of min-max-scaled parameters shaped (batch size, n parameters).
    """
    L = len(image_paths)
    # This outer loop just makes the generator infinite; Keras needs that.
    while True:
        batch_start = 0
        batch_end = batch_size
        while batch_start < L:
            matrices, parameters = [], []
            # Select the batch.  Slicing clamps at L automatically, so the
            # old, unused `limit = min(batch_end, L)` local has been dropped.
            batch = image_paths[batch_start:batch_end]
            for path in batch:
                # Get the matrix, parameters and the name
                mat, param, name = get_Matrix_and_Parameters(path)
                # Transform the matrix from 2D to 3D as an RGB-like image.
                # preserve_range=True skips skimage's own value rescaling,
                # saving memory and load time (training results might differ though).
                mat = skimage.transform.resize(mat, (mat.shape[0]//down_size, mat.shape[1]//down_size, 3),
                                               mode='constant', preserve_range=True)
                # Scale and normalise the parameters and the matrix
                param = MMscale_param(param, name)
                mat = normalise(mat)
                matrices.append(mat)
                parameters.append(param)
            # Convert the lists into arrays and reshape the parameter array to (batch size, number of parameters)
            MAT, PAM = np.array(matrices), np.array(parameters)
            PAM = np.reshape(PAM, (PAM.shape[0], PAM.shape[1]))
            # Output the arrays; collect garbage after each batch to save memory
            yield MAT, PAM
            batch_start += batch_size
            batch_end += batch_size
            gc.collect()  # Garbage Collector
# Load and shuffle the paths randomly
matrix_paths_NIR = get_MATRIX_paths('NIR')
random.Random(32).shuffle(matrix_paths_NIR)  # Fixed seed: shuffle the same way each time
# 80/10/10 split into training / validation / test sets.
training_paths_NIR = matrix_paths_NIR[:int(len(matrix_paths_NIR)*0.8)]  # train on 80% of the data set
validation_paths_NIR = matrix_paths_NIR[int(len(matrix_paths_NIR)*0.8):int(len(matrix_paths_NIR)*0.9)]  # validate on 10%
test_paths_NIR = matrix_paths_NIR[int(len(matrix_paths_NIR)*0.9):]  # test on 10%
print("Number of training samples for NIR: ", len(training_paths_NIR))
print("Number of validation samples for NIR: ", len(validation_paths_NIR))
print("Number of test samples for NIR: ", len(test_paths_NIR))
# Copy the test files to another folder
#for i in range(60):
#    shutil.copy(test_paths_NIR[i], "/home/salomonsson/Master Project/temporary/NIR/")
# Down size the matrix this much. down_size = 1 equals no scaling. However, this doesnt fit into the GPU memory!
down_size = 2
# Input shape: the full power-matrix size divided by down_size, as a 3-channel image.
NIR_model = CNN_model_FULL(input_shape=(3724//down_size, 4073//down_size, 3))
# Divide the input shape by 8 and remove the last three layers (1024, 2048, 4096) in the CNN_model results
# in very few trainable parameters (8M, instead of 203). Try it!?
batch_size = 2  # (When running with larger batch size on GPU, a ResourceExhaustedError is thrown.)
my_training_batch_generator_NIR = Data_Generator(training_paths_NIR, batch_size)  # training generator
my_validation_batch_generator_NIR = Data_Generator(validation_paths_NIR, batch_size)  # validation generator
my_test_batch_generator_NIR = Data_Generator(test_paths_NIR, batch_size)  # testing generator
NIR_model.compile(optimizer=optimizers.Adam(lr=0.0001), loss='mean_squared_error', metrics=['mse'])
# Important to note: for regression cases, one needs to use mse or mae as loss function.
Before that, create a checkpointer to store the best model.
When using the GPU for CNN training across different Jupyter notebooks, cleaning the GPU memory between runs might be necessary. Do so by first running `sudo fuser -v /dev/nvidia*` in a terminal window to find the process, then executing `sudo kill xxxxx`, where xxxxx is the process's PID number.
def fxn():
    """ Emit the UserWarning about int64 -> float64 precision loss, so it can
    be suppressed inside a warnings.catch_warnings() block below. """
    message = "Possible precision loss when converting from int64 to float64"
    warnings.warn(message, UserWarning)
# Choose the model/log locations for the current machine (macOS laptop vs. Linux training box).
on_mac = "Darwin" in os.uname()
if on_mac:
    models_dir = "/Users/jakob/Desktop/Master Project/Models/"
else:
    models_dir = "/home/salomonsson/Master Project/Models/"
model_path = models_dir + "NIR_model.weights.best.hdf5"
log_path = models_dir + "NIR_training.log"
# Load the model to continue the training
#NIR_model.load_weights(model_path)
start = time.time() # Time it
num_epochs = 100
with warnings.catch_warnings(): # Catch and ignore the precision-loss warning raised by fxn()
    warnings.simplefilter("ignore")
    fxn()
    csv_logger = CSVLogger(log_path, separator=',', append=False) # Log the training
    checkpointer = ModelCheckpoint(filepath=model_path,
                                   verbose=1,
                                   save_best_only=True) # Save the best model
    architecture = TensorBoard(log_dir='./Graph/NIR', histogram_freq=0,
                               write_graph=True, write_images=True) # Save the CNN Architecture
    # BUG FIX: steps_per_epoch previously used len(validation_paths_NIR), so every epoch only
    # covered as many batches as the (10x smaller) validation split. Use the training split.
    # (If short epochs were intentional, reduce num_epochs instead — TODO confirm.)
    history = NIR_model.fit_generator(generator=my_training_batch_generator_NIR,
                                      steps_per_epoch=(len(training_paths_NIR) // batch_size),
                                      epochs=num_epochs,
                                      verbose=1,
                                      callbacks=[checkpointer, csv_logger, architecture],
                                      validation_data=my_validation_batch_generator_NIR,
                                      validation_steps=(len(validation_paths_NIR) // batch_size),
                                      use_multiprocessing=True,
                                      max_queue_size=3,
                                      workers=10,
                                      shuffle=True)
# Print total training time
end = time.time()
tt = end-start
print()
print("Total training time was {0:0.0f} min and {1:0.0f}s. ".format((tt-tt%60)/60, tt%60))
# max_queue_size : It specifies how many batches it's going to prepare in the queue.
# It doesn't mean you'll have multiple generator instances, nor that the training starts only when the queue is filled.
# If your targets are one-hot encoded, use categorical_crossentropy. "fit_generator" one-hot encodes
# the input data automatically!
# But if your targets are integers, use sparse_categorical_crossentropy.
# Increasing the number of "workers" might result in better GPU Utilisation as there are more CPUs
# who feed the GPU with data.
#backend.clear_session()
# tensorboard --logdir=/home/salomonsson/Master\ Project/Graph/
# BUG FIX: this is the NIR section, but the TensorBoard callback was attached to VIS_model,
# which is not even defined at this point in the notebook (NameError). Attach the NIR model.
architecture.set_model(NIR_model)
# Then run "tensorboard --logdir ./Graph/NIR/" in Terminal
# (MSE and Loss is the same)
# Load the training history from file
log_data = pd.read_csv(log_path, sep=',', engine='python')
print(log_data.columns)
log_data = log_data.iloc[1:, :]  # drop the first epoch row (presumably its large initial loss distorts the plot — TODO confirm)
# Plot for the Error
plt.figure(figsize=(14,7))
plt.plot(log_data['loss'], color="black")      # training loss
plt.plot(log_data['val_loss'], color="red")    # validation loss
plt.title("Mean Squared Error (NIR)", fontsize=20)
plt.ylabel("MSE", fontsize=17)
plt.xlabel("Epochs", fontsize=17)
# Legend shows the last-epoch errors; iloc[-1,-3] / iloc[-1,-1] address the loss columns by position
plt.legend(["Training error: " + str(log_data.iloc[-1,-3])[:8],
            "Validation error: " + str(log_data.iloc[-1,-1])[:8]],
           loc="upper right", fontsize=15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.autoscale(enable=True, axis="x", tight=True)
plt.show()
#print(history.history.keys())
#print(history.history['mean_squared_error'])
# Select the best-weights checkpoint and the prediction output file for this machine.
if "Darwin" in os.uname():
    model_path = "/Users/jakob/Desktop/Master Project/Models/NIR_model.weights.best.hdf5"
    pred_path = "/Users/jakob/Desktop/Master Project/Predictions/Some_preds_NIR.txt"
else:
    model_path = "/home/salomonsson/Master Project/Models/NIR_model.weights.best.hdf5"
    pred_path = "/home/salomonsson/Master Project/Predictions/Some_preds_NIR.txt"
NIR_model.load_weights(model_path)  # restore the best checkpoint saved during training
np.set_printoptions(precision=3)
list_for_saving = []  # accumulates one single-row DataFrame per test spectrum
print("Parameters are printed as [Temperature, Log(G), Metallicity]")
print("---------------------------------------------------------------------------------------------")
for path in range(50):  # evaluate the first 50 test spectra ("path" is an index, not a path string)
    # Load the matrix, parameters and the name. Resize the matrix
    mat, param, name = get_Matrix_and_Parameters(test_paths_NIR[path])
    mat = skimage.transform.resize(mat, (mat.shape[0]//down_size, mat.shape[1]//down_size, 3),
                                   mode='constant', preserve_range=True)
    # Reshape the matrix to "[batch, 3D RGB image]"
    mat = np.reshape(mat, (1, mat.shape[0], mat.shape[1], mat.shape[2]))
    # Rescale the matrix to [0,1]
    mat = normalise(mat)
    # Fit the scaler on the parameters
    scaled_param = MMscale_param(param, name)  # NOTE(review): return value unused; presumably called for its fitting side effect — confirm
    # Use the model to predict the parameters.
    predicted_parameters = NIR_model.predict(mat)
    # Use the fitted scaler to unscale the parameters to their original interval
    unscaled_pred_param = Un_scale_data(predicted_parameters, name)
    # Relative difference; note the denominator is the *predicted* value, not the true one
    diff = (abs(unscaled_pred_param.reshape(1,-1)-param))/abs(unscaled_pred_param.reshape(1,-1))
    # Prepare for saving to file
    d1 = pd.DataFrame(unscaled_pred_param.reshape(1,-1))
    d2 = pd.DataFrame(param.reshape(1, -1))
    d = pd.concat([d1, d2], axis=1)
    list_for_saving.append(d)
    # Print the results (transpose the unscaled predicted parameters for readability).
    print("Predicted: {0}, Real values: {1}, DIFF: {2}".format(np.transpose(unscaled_pred_param)[0], param, diff))
    print()
# Save to file (stack all rows; note this rebinds the loop-local name d2)
d2 = pd.concat(list_for_saving, axis=0)
columns = ["Pred. Temperature", "Pred Log(g)", "Pred Metallicity",
           "True Temperature", "True Log(g)", "True Metallicity"]
d2.columns = columns
d2.to_csv(pred_path, header=True, index=False, mode="w")
print()
print("End")
# Load the data first
loaded_predictions_NIR = pd.read_csv(pred_path, sep=',', engine='python')
def autolabel(rects, color='black', extra_height=0):
    """
    Write each bar's height (rounded to a whole number) just above the bar.

    rects is a matplotlib bar container; extra_height shifts the label
    vertically. Relies on the module-level Axes object ``ax``.
    """
    for bar in rects:
        value = bar.get_height()
        x_mid = bar.get_x() + bar.get_width() / 2.
        y_pos = 1.005 * (value + extra_height)
        ax.text(x_mid, y_pos,
                '{0:2.0f}'.format(float(value)),
                ha='center', va='bottom', color=color)
def autolabel_2(rects, color='black', extra_height=0):
    """
    Write each bar's height (one decimal place) just above the bar.

    Same as autolabel() but with a one-decimal format; uses the
    module-level Axes object ``ax``.
    """
    for bar in rects:
        value = bar.get_height()
        label = '{0:2.1f}'.format(float(value))
        ax.text(bar.get_x() + bar.get_width() / 2.,
                1.005 * (value + extra_height),
                label, ha='center', va='bottom', color=color)
# --- Bar plot 1: predicted vs. true effective temperature for ten NIR test runs ---
# data to plot
n_groups = 10
pred_temp = loaded_predictions_NIR.loc[1:10, "Pred. Temperature"] * 100  # assumes temperatures are stored /100 — TODO confirm units
true_temp = loaded_predictions_NIR.loc[1:10, "True Temperature"] * 100
# create plot
fig, ax = plt.subplots(figsize=(7,5))
index = np.arange(n_groups)
bar_width = 0.15
opacity = 1
rects1 = plt.bar(index, pred_temp, bar_width,
                 alpha=opacity,
                 color='black',
                 label='Predicted Effective Temperature')
rects2 = plt.bar(index + bar_width, true_temp, bar_width,
                 alpha=opacity,
                 color='red',
                 label='True Temperature')
plt.xlabel('Test Run', fontsize=17)
plt.ylabel('Temperature (K)', fontsize=17)
plt.title('Eff. Temp. NIR, down x20', fontsize=20)
plt.xticks(index + bar_width, ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), fontsize=15)
plt.yticks(fontsize=15)
#plt.legend(loc="lower right", fontsize=15, framealpha=0.9)
plt.tight_layout()
plt.show()
# --- Bar plot 2: predicted vs. true surface gravity ---
# data to plot
n_groups = 10
#pred_log = np.power(10, -1*loaded_predictions_NIR.loc[1:10, "Pred Log(g)"])
#true_log = np.power(10, -1*loaded_predictions_NIR.loc[1:10, "True Log(g)"])
# Lets go for the normal ones (plot the negated log values rather than linear gravity)
pred_log = -1*loaded_predictions_NIR.loc[1:10, "Pred Log(g)"]
true_log = -1*loaded_predictions_NIR.loc[1:10, "True Log(g)"]
# create plot
fig, ax = plt.subplots(figsize=(7,5))
index = np.arange(n_groups)
bar_width = 0.15
opacity = 1
rects1 = plt.bar(index, pred_log, bar_width,
                 alpha=opacity,
                 color='black',
                 label='Predicted Surface Gravity')
rects2 = plt.bar(index + bar_width, true_log, bar_width,
                 alpha=opacity,
                 color='red',
                 label='True Surface Gravity')
plt.xlabel('Test Run', fontsize=17)
plt.ylabel('Surface Gravity', fontsize=17)
plt.title('Surface Gravity NIR, down x20', fontsize=20)
plt.xticks(index + bar_width, ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), fontsize=15)
plt.yticks(fontsize=15)
#plt.legend(loc="upper right", fontsize=15, framealpha=0.9)
#autolabel_2(rects1, extra_height=-.15)
#autolabel_2(rects2, 'red', extra_height=-.15)
plt.tight_layout()
plt.show()
# --- Bar plot 3: predicted vs. true metallicity ---
# data to plot
n_groups = 10
pred_met = loaded_predictions_NIR.loc[1:10, "Pred Metallicity"]
true_met = loaded_predictions_NIR.loc[1:10, "True Metallicity"]
# create plot
fig, ax = plt.subplots(figsize=(7,5))
index = np.arange(n_groups)
bar_width = 0.15
opacity = 1
rects1 = plt.bar(index, pred_met, bar_width,
                 alpha=opacity,
                 color='black',
                 label='Predicted Metallicity')
rects2 = plt.bar(index + bar_width, true_met, bar_width,
                 alpha=opacity,
                 color='red',
                 label='True Metallicity')
plt.xlabel('Test Run', fontsize=17)
plt.ylabel('Metallicity', fontsize=17)
plt.title('Metallicity NIR, down x20', fontsize=20)
plt.xticks(index + bar_width, ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), fontsize=15)
plt.yticks(fontsize=15)
#plt.legend(loc="lower right", fontsize=15, framealpha=0.9)
#autolabel_2(rects1, extra_height=-.15)
#autolabel_2(rects2, 'red', extra_height=-.15)
plt.tight_layout()
plt.show()
# Evaluate the trained NIR model on the held-out test generator.
loss_and_metrics = NIR_model.evaluate_generator(my_test_batch_generator_NIR,
                                                steps=60,
                                                max_queue_size=1,
                                                workers=5,
                                                use_multiprocessing=True,
                                                verbose=1)
print(NIR_model.metrics_names)
print(loss_and_metrics)
# Load the data first
loaded_predictions_NIR = pd.read_csv(pred_path, sep=',', engine='python')
# Remove rows with 0-valued targets (these would break the relative-error division below)
loaded_predictions_NIR = loaded_predictions_NIR[loaded_predictions_NIR.loc[:, "True Log(g)"] != 0.0]
loaded_predictions_NIR = loaded_predictions_NIR[loaded_predictions_NIR.loc[:, "True Metallicity"] != 0.0]
# Calculate relative differences between predicted and true values (denominator is the prediction)
loaded_predictions_NIR.loc[:, "Diff Temp (%)"] = (loaded_predictions_NIR.loc[:, "Pred. Temperature"]\
    -loaded_predictions_NIR.loc[:, "True Temperature"])/loaded_predictions_NIR.loc[:, "Pred. Temperature"]
loaded_predictions_NIR.loc[:, "Diff Log(g) (%)"] = (loaded_predictions_NIR.loc[:, "Pred Log(g)"]\
    -loaded_predictions_NIR.loc[:, "True Log(g)"])/loaded_predictions_NIR.loc[:, "Pred Log(g)"]
loaded_predictions_NIR.loc[:, "Diff Met (%)"] = (loaded_predictions_NIR.loc[:, "Pred Metallicity"]\
    -loaded_predictions_NIR.loc[:, "True Metallicity"])/loaded_predictions_NIR.loc[:, "Pred Metallicity"]
diff_log_NIR = copy.deepcopy(loaded_predictions_NIR)
# Keep only the data within +2 to -2 standard deviations (simple outlier rejection).
diff_log_NIR = diff_log_NIR[(np.abs(diff_log_NIR["Diff Temp (%)"]-diff_log_NIR["Diff Temp (%)"].mean())\
    <= (2*diff_log_NIR["Diff Temp (%)"].std()))]
diff_log_NIR = diff_log_NIR[(np.abs(diff_log_NIR["Diff Log(g) (%)"]-diff_log_NIR["Diff Log(g) (%)"].mean())\
    <= (2*diff_log_NIR["Diff Log(g) (%)"].std()))]
diff_log_NIR = diff_log_NIR[(np.abs(diff_log_NIR["Diff Met (%)"]-diff_log_NIR["Diff Met (%)"].mean())\
    <= (2*diff_log_NIR["Diff Met (%)"].std()))]
display(diff_log_NIR.head(20))  # `display` is the IPython/Jupyter rich display helper
print("NIR")
print("=================")
print()
# Print the Mean error (square of the mean relative difference)
print("MSE for Temperature: {0:2.6f}".format(np.power(diff_log_NIR["Diff Temp (%)"].mean(), 2)))
print("MSE for Gravity: {0:2.6f}".format(np.power(diff_log_NIR["Diff Log(g) (%)"].mean(), 2)))
print("MSE for Metallicity: {0:2.6f}".format(np.power(diff_log_NIR["Diff Met (%)"].mean(), 2)))
print()
# Print the median error
print("Median error for Temperature: {0:2.6f}".format(np.abs(diff_log_NIR["Diff Temp (%)"].median())))
print("Median error for Gravity: {0:2.6f}".format(np.abs(diff_log_NIR["Diff Log(g) (%)"].median())))
print("Median error for Metallicity: {0:2.6f}".format(np.abs(diff_log_NIR["Diff Met (%)"].median())))
# Clear the session as a preperation for the next one.
backend.clear_session()
# Load and shuffle the VIS matrix paths, then split 80/10/10 into train/validation/test.
matrix_paths_VIS = get_MATRIX_paths('VIS')
random.Random(32).shuffle(matrix_paths_VIS) # Shuffle the same way each time (use 32)
training_paths_VIS = matrix_paths_VIS[:int(len(matrix_paths_VIS)*0.8)] # train on 80% of the data set
validation_paths_VIS = matrix_paths_VIS[int(len(matrix_paths_VIS)*0.8):int(len(matrix_paths_VIS)*0.9)] #validate on 10%
test_paths_VIS = matrix_paths_VIS[int(len(matrix_paths_VIS)*0.9):] # test on 10%
# BUG FIX: these messages said "NIR" although the counts are for the VIS split.
print("Number of training samples for VIS: ", len(training_paths_VIS))
print("Number of validation samples for VIS: ", len(validation_paths_VIS))
print("Number of test samples for VIS: ", len(test_paths_VIS))
## Copy the test files to another folder
#folderPath = "/home/salomonsson/Master Project/temporary/VIS/" #to get the path of the folder
#for i in range(60):
#    shutil.copy(test_paths_VIS[i], folderPath)
# NOTE: removed a leftover no-op expression statement (`7747//down_size`) that had no effect.
# Down size the matrix this much. down_size = 1 equals no scaling. However, this doesnt fit into the GPU memory!
down_size = 3 # 2 doesnt work for GPU memory reasons
# Full-resolution VIS power matrices are 7747 x 2871 RGB images.
VIS_model = CNN_model_FULL(input_shape=(7747//down_size, 2871//down_size, 3))
# Divide the input shape by 8 and remove the last three layers (1024, 2048, 4096) in the CNN_model results
# in very few trainable parameters (8M, instead of 203). Try it!?
batch_size = 2 # (When running with larger batch size on GPU, a ResourceExhaustedError is thrown.)
# Keras Sequence-based generators for the three VIS splits.
my_training_batch_generator_VIS = Data_Generator(training_paths_VIS, batch_size) # training generator
my_validation_batch_generator_VIS = Data_Generator(validation_paths_VIS, batch_size) # validation generator
my_test_batch_generator_VIS = Data_Generator(test_paths_VIS, batch_size) # testing generator
VIS_model.compile(optimizer=optimizers.Adam(lr=0.0001), loss='mean_squared_error', metrics=['mse'])
# Important to note: for regression cases, one needs to use mse or mae as loss function.
# Before that, create a checkpointer to store the best model.
# When using the GPU for CNN training across different Jupyter notebooks, the GPU memory may need to be
# cleared between runs. Do so by first calling `sudo fuser -v /dev/nvidia*` in a Terminal window to find
# the process' PID, and then executing `sudo kill <PID>`.
def fxn():
    """Trigger the UserWarning about int64 -> float64 precision loss so callers can suppress it."""
    category = UserWarning
    warnings.warn("Possible precision loss when converting from int64 to float64", category)
# Select machine-specific output locations (macOS laptop vs. Linux training box).
# Note: the Linux log file name records the run configuration (150 epochs, /3 downsize, batch 2).
if "Darwin" in os.uname():
    models_dir = "/Users/jakob/Desktop/Master Project/Models/"
    model_path = models_dir + "VIS_model.weights.best.hdf5"
    log_path = models_dir + "VIS_training.log"
else:
    models_dir = "/home/salomonsson/Master Project/Models/"
    model_path = models_dir + "VIS_model.weights.best.hdf5"
    log_path = models_dir + "VIS_150epochs_Matrix_DivOn3_batch2_Sequence.log"
# Load the model to continue the training
#VIS_model.load_weights(model_path)
start = time.time() # Time it
num_epochs = 100
with warnings.catch_warnings(): # Catch and ignore the precision-loss warning raised by fxn()
    warnings.simplefilter("ignore")
    fxn()
    csv_logger = CSVLogger(log_path, separator=',', append=False) # Log the training
    checkpointer = ModelCheckpoint(filepath=model_path,
                                   verbose=1,
                                   save_best_only=True) # Save the best model only
    architecture = TensorBoard(log_dir='./Graph/VIS/', histogram_freq=0,
                               write_graph=True, write_images=True) # Save the CNN Architecture
    # BUG FIX: steps_per_epoch previously used len(validation_paths_VIS), so every epoch only
    # covered as many batches as the (10x smaller) validation split. Use the training split.
    # (If short epochs were intentional, reduce num_epochs instead — TODO confirm.)
    history = VIS_model.fit_generator(generator=my_training_batch_generator_VIS,
                                      steps_per_epoch=(len(training_paths_VIS) // batch_size),
                                      epochs=num_epochs,
                                      verbose=1,
                                      callbacks=[checkpointer, csv_logger, architecture],
                                      validation_data=my_validation_batch_generator_VIS,
                                      validation_steps=(len(validation_paths_VIS) // batch_size),
                                      use_multiprocessing=True,
                                      max_queue_size=2,
                                      workers=10,
                                      shuffle=True)#,
                                      #initial_epoch=6) # start training from here
# Print total training time
end = time.time()
tt = end-start
print()
print("Total training time was {0:0.0f} min and {1:0.0f}s. ".format((tt-tt%60)/60, tt%60))
# tensorboard --logdir=/home/salomonsson/Master\ Project/Graph/
# (MSE and Loss is the same)
# Load the training history from file
log_data = pd.read_csv(log_path, sep=',', engine='python')
print(log_data.columns)
log_data = log_data.iloc[1:, :]  # drop the first epoch row (presumably its large initial loss distorts the plot — TODO confirm)
# Plot for the Error
plt.figure(figsize=(14,7))
plt.plot(log_data['loss'], color="black")      # training loss
plt.plot(log_data['val_loss'], color="red")    # validation loss
plt.title("Mean Squared Error (VIS)", fontsize=20)
plt.ylabel("MSE", fontsize=17)
plt.xlabel("Epochs", fontsize=17)
# Legend shows the last-epoch errors; iloc[-1,-3] / iloc[-1,-1] address the loss columns by position
plt.legend(["Training error: " + str(log_data.iloc[-1,-3])[:8],
            "Validation error: " + str(log_data.iloc[-1,-1])[:8]],
           loc="upper right", fontsize=15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.autoscale(enable=True, axis="x", tight=True)
plt.show()
# Attach the VIS model so its graph can be inspected in TensorBoard.
architecture.set_model(VIS_model)
# Then run "tensorboard --logdir .Graph/VIS/" in Terminal
# Select the best-weights checkpoint and the prediction output file for this machine.
if "Darwin" in os.uname():
    model_path = "/Users/jakob/Desktop/Master Project/Models/VIS_model.weights.best.hdf5"
    pred_path = "/Users/jakob/Desktop/Master Project/Predictions/Some_preds_VIS.txt"
else:
    model_path = "/home/salomonsson/Master Project/Models/VIS_model.weights.best.hdf5"
    pred_path = "/home/salomonsson/Master Project/Predictions/VIS_150epochs_Matrix_DivOn3_batch2_Sequence.txt"
# NOTE(review): the checkpoint reload is commented out, so predictions below use the in-memory weights.
#VIS_model.load_weights(model_path)
np.set_printoptions(precision=3)
list_for_saving = []  # accumulates one single-row DataFrame per test spectrum
print("Parameters are printed as [Temperature, Log(G), Metallicity]")
print("---------------------------------------------------------------------------------------------")
for path in range(50):  # evaluate the first 50 test spectra ("path" is an index, not a path string)
    # Load the matrix, parameters and the name. Resize the matrix
    mat, param, name = get_Matrix_and_Parameters(test_paths_VIS[path])
    mat = skimage.transform.resize(mat, (mat.shape[0]//down_size, mat.shape[1]//down_size, 3),
                                   mode='constant', preserve_range=True)
    # Reshape the matrix to "[batch, 3D RGB image]"
    mat = np.reshape(mat, (1, mat.shape[0], mat.shape[1], mat.shape[2]))
    # Rescale the matrix to [0,1]
    mat = normalise(mat)
    # Fit the scaler on the parameters
    scaled_param = MMscale_param(param, name)  # NOTE(review): return value unused; presumably called for its fitting side effect — confirm
    # Use the model to predict the parameters.
    predicted_parameters = VIS_model.predict(mat)
    # Use the fitted scaler to unscale the parameters to their original interval
    unscaled_pred_param = Un_scale_data(predicted_parameters, name)
    # Relative difference; note the denominator is the *predicted* value, not the true one
    diff = (abs(unscaled_pred_param.reshape(1,-1)-param))/abs(unscaled_pred_param.reshape(1,-1))
    # Prepare for saving to file
    d1 = pd.DataFrame(unscaled_pred_param.reshape(1,-1))
    d2 = pd.DataFrame(param.reshape(1, -1))
    d = pd.concat([d1, d2], axis=1)
    list_for_saving.append(d)
    # Print the results (transpose the unscaled predicted parameters for readability).
    print("Predicted: {0}, Real values: {1}, DIFF: {2}".format(np.transpose(unscaled_pred_param)[0], param, diff))
    print()
# Save to file (stack all rows; note this rebinds the loop-local name d2)
d2 = pd.concat(list_for_saving, axis=0)
columns = ["Pred. Temperature", "Pred Log(g)", "Pred Metallicity",
           "True Temperature", "True Log(g)", "True Metallicity"]
d2.columns = columns
d2.to_csv(pred_path, header=True, index=False, mode="w")
print()
print("End")
# Load the data first
loaded_predictions_VIS = pd.read_csv(pred_path, sep=',', engine='python')
def autolabel(rects, color='black', extra_height=0):
    """
    Annotate every bar in `rects` with its height, formatted as an integer.

    extra_height shifts the label vertically; the module-level Axes `ax`
    receives the text.
    """
    for bar in rects:
        value = bar.get_height()
        label = '{0:2.0f}'.format(float(value))
        ax.text(bar.get_x() + bar.get_width() / 2.,
                1.005 * (value + extra_height),
                label, ha='center', va='bottom', color=color)
def autolabel_2(rects, color='black', extra_height=0):
    """
    Annotate every bar in `rects` with its height to one decimal place.

    extra_height shifts the label vertically; the module-level Axes `ax`
    receives the text.
    """
    for bar in rects:
        value = bar.get_height()
        x_mid = bar.get_x() + bar.get_width() / 2.
        y_pos = 1.005 * (value + extra_height)
        ax.text(x_mid, y_pos,
                '{0:2.1f}'.format(float(value)),
                ha='center', va='bottom', color=color)
# --- Bar plot 1: predicted vs. true effective temperature for ten VIS test runs ---
# data to plot (row 7 is skipped, hence the explicit index list)
n_groups = 10
pred_temp = loaded_predictions_VIS.loc[[1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "Pred. Temperature"] * 100  # assumes temperatures are stored /100 — TODO confirm units
true_temp = loaded_predictions_VIS.loc[[1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "True Temperature"] * 100
# create plot
fig, ax = plt.subplots(figsize=(14,7))
index = np.arange(n_groups)
bar_width = 0.25
opacity = 1
rects1 = plt.bar(index, pred_temp, bar_width,
                 alpha=opacity,
                 color='black',
                 label='Predicted Effective Temperature')
rects2 = plt.bar(index + bar_width, true_temp, bar_width,
                 alpha=opacity,
                 color='red',
                 label='True Temperature')
plt.xlabel('Test Run', fontsize=17)
plt.ylabel('Temperature (K)', fontsize=17)
plt.title('Effective Temperature for VIS', fontsize=20)
plt.xticks(index + bar_width, ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), fontsize=15)
plt.yticks(fontsize=15)
plt.legend(loc="lower right", fontsize=15, framealpha=0.9)
plt.tight_layout()
plt.show()
# --- Bar plot 2: predicted vs. true surface gravity (converted from log(g) to linear) ---
# data to plot
n_groups = 10
pred_log = np.power(10, -1*loaded_predictions_VIS.loc[[1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "Pred Log(g)"])
true_log = np.power(10, -1*loaded_predictions_VIS.loc[[1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "True Log(g)"])
# create plot
fig, ax = plt.subplots(figsize=(14,7))
index = np.arange(n_groups)
bar_width = 0.25
opacity = 1
rects1 = plt.bar(index, pred_log, bar_width,
                 alpha=opacity,
                 color='black',
                 label='Predicted Surface Gravity')
rects2 = plt.bar(index + bar_width, true_log, bar_width,
                 alpha=opacity,
                 color='red',
                 label='True Surface Gravity')
plt.xlabel('Test Run', fontsize=17)
plt.ylabel('Surface Gravity', fontsize=17)
plt.title('Surface Gravity for VIS', fontsize=20)
plt.xticks(index + bar_width, ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), fontsize=15)
plt.yticks(fontsize=15)
plt.legend(loc="upper right", fontsize=15, framealpha=0.9)
autolabel(rects1)
# Manually label the true-value bars: bars 2, 7 and 9 get an extra vertical offset so
# their labels do not overlap the neighbouring bars.
c, extra_height = 0, 3500
for rect in rects2:
    if c == 2 or c == 7 or c == 9:
        height = rect.get_height()
        height1 = height + extra_height
        ax.text(rect.get_x() + rect.get_width()/2., 1.005*height1,
                '{0:2.0f}'.format(float(height)),
                ha='center', va='bottom', color='red')
    c += 1
# Dumb solution, but it works! (remaining bars get labels without the extra offset)
c = 0
for rect in rects2:
    if c == 1 or c == 3 or c == 4 or c == 5 or c == 6 or c == 8 or c == 0:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2., 1.005*height,
                '{0:2.0f}'.format(float(height)),
                ha='center', va='bottom', color='red')
    c += 1
plt.tight_layout()
plt.show()
# --- Bar plot 3: predicted vs. true metallicity ---
# data to plot
n_groups = 10
pred_met = loaded_predictions_VIS.loc[[1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "Pred Metallicity"]
true_met = loaded_predictions_VIS.loc[[1, 2, 3, 4, 5, 6, 8, 9, 10, 11], "True Metallicity"]
# create plot
fig, ax = plt.subplots(figsize=(14,7))
index = np.arange(n_groups)
bar_width = 0.25
opacity = 1
rects1 = plt.bar(index, pred_met, bar_width,
                 alpha=opacity,
                 color='black',
                 label='Predicted Metallicity')
rects2 = plt.bar(index + bar_width, true_met, bar_width,
                 alpha=opacity,
                 color='red',
                 label='True Metallicity')
plt.xlabel('Test Run', fontsize=17)
plt.ylabel('Metallicity', fontsize=17)
plt.title('Metallicity for VIS', fontsize=20)
plt.xticks(index + bar_width, ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), fontsize=15)
plt.yticks(fontsize=15)
plt.legend(loc="lower left", fontsize=15, framealpha=0.9)
autolabel_2(rects1, extra_height=-.15)
autolabel_2(rects2, 'red', extra_height=-.15)
plt.tight_layout()
plt.show()
# Evaluate the trained VIS model on the held-out test generator.
loss_and_metrics = VIS_model.evaluate_generator(my_test_batch_generator_VIS,
                                                steps=100,
                                                max_queue_size=1,
                                                workers=4,
                                                use_multiprocessing=True,
                                                verbose=1)
print(VIS_model.metrics_names)
print(loss_and_metrics)
# Load the data first
loaded_predictionsVIS = pd.read_csv(pred_path, sep=',', engine='python')
# Remove rows with 0-valued targets (these would break the relative-error division below)
loaded_predictionsVIS = loaded_predictionsVIS[loaded_predictionsVIS.loc[:, "True Log(g)"] != 0.0]
loaded_predictionsVIS = loaded_predictionsVIS[loaded_predictionsVIS.loc[:, "True Metallicity"] != 0.0]
# Calculate relative differences between predicted and true values (denominator is the prediction)
loaded_predictionsVIS.loc[:, "Diff Temp (%)"] = (loaded_predictionsVIS.loc[:, "Pred. Temperature"]\
    -loaded_predictionsVIS.loc[:, "True Temperature"])/loaded_predictionsVIS.loc[:, "Pred. Temperature"]
loaded_predictionsVIS.loc[:, "Diff Log(g) (%)"] = (loaded_predictionsVIS.loc[:, "Pred Log(g)"]\
    -loaded_predictionsVIS.loc[:, "True Log(g)"])/loaded_predictionsVIS.loc[:, "Pred Log(g)"]
loaded_predictionsVIS.loc[:, "Diff Met (%)"] = (loaded_predictionsVIS.loc[:, "Pred Metallicity"]\
    -loaded_predictionsVIS.loc[:, "True Metallicity"])/loaded_predictionsVIS.loc[:, "Pred Metallicity"]
diff_log = copy.deepcopy(loaded_predictionsVIS)
# Keep only the data within +2 to -2 standard deviations (simple outlier rejection).
diff_log = diff_log[(np.abs(diff_log["Diff Temp (%)"]-diff_log["Diff Temp (%)"].mean())\
    <= (2*diff_log["Diff Temp (%)"].std()))]
diff_log = diff_log[(np.abs(diff_log["Diff Log(g) (%)"]-diff_log["Diff Log(g) (%)"].mean())\
    <= (2*np.abs(diff_log["Diff Log(g) (%)"]).std()))]
diff_log = diff_log[(np.abs(diff_log["Diff Met (%)"]-diff_log["Diff Met (%)"].mean())\
    <= (2*diff_log["Diff Met (%)"].std()))]
diff_log = diff_log[np.abs(diff_log.loc[:, "Diff Log(g) (%)"]) <= 5.0] # manually remove three values that are x7 bigger
display(diff_log.head(20))  # `display` is the IPython/Jupyter rich display helper
print("VIS")
print("=================")
print()
# Print the Mean error (square of the mean relative difference)
print("MSE for Temperature: {0:2.6f}".format(np.power(diff_log["Diff Temp (%)"].mean(), 2)))
print("MSE for Gravity: {0:2.6f}".format(np.power(diff_log["Diff Log(g) (%)"].mean(), 2)))
print("MSE for Metallicity: {0:2.6f}".format(np.power(diff_log["Diff Met (%)"].mean(), 2)))
print()
# Print the median error
print("Median error for Temperature: {0:2.6f}".format(np.abs(diff_log["Diff Temp (%)"].median())))
print("Median error for Gravity: {0:2.6f}".format(np.abs(diff_log["Diff Log(g) (%)"].median())))
print("Median error for Metallicity: {0:2.6f}".format(np.abs(diff_log["Diff Met (%)"].median())))
# Clear the session as a preperation for the next one.
backend.clear_session()
def get_CARMENES_norm_Info(NIR_or_VIS):
    """ Returns the file paths and names for the pre-processed Carmenes data.
    NIR_or_VIS is a string indicating the desired output ("NIR" or "VIS").
    Returns a (paths, names) tuple; raises ValueError for any other channel string.
    """
    # Get the directory for each .dat file and store them in file_paths
    # Choose directory depending on the OS I'm running
    if "Darwin" in os.uname():
        directory = "/Users/jakob/Desktop/Master Project/git/carmenes_normalised/"
    else:
        directory = "/home/salomonsson/Master Project/git/carmenes_normalised/"
    carmenes_norm_paths = glob(directory + "*.dat")
    carmenes_norm_paths_NIR, carmenes_norm_paths_VIS = [], []
    for path in carmenes_norm_paths:
        if "NIR" in path.upper():
            carmenes_norm_paths_NIR.append(path)
        elif "VIS" in path.upper():
            carmenes_norm_paths_VIS.append(path)
    # Sort the file paths
    carmenes_norm_paths_NIR = sorted(carmenes_norm_paths_NIR, key=sorted_nicely_short)
    carmenes_norm_paths_VIS = sorted(carmenes_norm_paths_VIS, key=sorted_nicely_short)
    # BUG FIX: names used to be extracted with the magic slice str(path)[57:-4], which only
    # matches the length of the Linux directory prefix (57 chars) and silently produced wrong
    # names on macOS (prefix length 60). basename/splitext works regardless of the directory.
    carmenes_norm_NAMES_NIR = [os.path.splitext(os.path.basename(p))[0] for p in carmenes_norm_paths_NIR]
    carmenes_norm_NAMES_VIS = [os.path.splitext(os.path.basename(p))[0] for p in carmenes_norm_paths_VIS]
    # Return NIR or VIS paths and names
    if NIR_or_VIS.upper() == "NIR":
        return carmenes_norm_paths_NIR, carmenes_norm_NAMES_NIR
    elif NIR_or_VIS.upper() == "VIS":
        return carmenes_norm_paths_VIS, carmenes_norm_NAMES_VIS
    else:
        # Raise instead of only printing, so a typo cannot silently return None.
        raise ValueError('The input is not correct. Enter "VIS" or "NIR" as strings. ')
# Collect paths and names for both CARMENES channels.
carmenes_norm_paths_NIR, carmenes_norm_NAMES_NIR = get_CARMENES_norm_Info("NIR")
carmenes_norm_paths_VIS, carmenes_norm_NAMES_VIS = get_CARMENES_norm_Info("VIS")
def create_CARMENES_MATRIX_path(name, NIR_or_VIS):
    """
    Return the .csv output path for a CARMENES power matrix.

    name       -- file stem (without extension)
    NIR_or_VIS -- channel selector, "NIR" or "VIS" (case-insensitive)

    Raises ValueError for any other channel string (previously the function
    only printed a message and then crashed with a NameError on `tp`;
    the message also had the letters swapped: "NIS"/"VIR").
    """
    if NIR_or_VIS.upper() == "NIR":
        tp = "NIR/"
    elif NIR_or_VIS.upper() == "VIS":
        tp = "VIS/"
    else:
        raise ValueError('Please enter "NIR" or "VIS" as second input parameter.')
    # Choose directory depending on the OS I'm running
    if "Darwin" in os.uname():
        base = "/Users/jakob/Desktop/Master Project/POWER_matrices_CARMENES/" + tp
    else:
        base = "/home/salomonsson/Master Project/POWER_matrices_CARMENES/" + tp
    return(base + name + ".csv")
def create_CARMENES_path(name, NIR_or_VIS):
    """Return the .dat output path for a prepared CARMENES spectrum.

    name       -- file stem (without extension)
    NIR_or_VIS -- channel selector, "NIR" or "VIS" (case-insensitive)

    Raises ValueError for any other channel string (previously only printed
    a garbled message and then crashed with a NameError on `tp`).
    """
    if NIR_or_VIS.upper() == "NIR":
        tp = "NIR/"
    elif NIR_or_VIS.upper() == "VIS":
        tp = "VIS/"
    else:
        raise ValueError('Please enter "NIR" or "VIS" as second input parameter.')
    # CONSISTENCY FIX: this used to test for the hostname "Jakobs-MBP.home"; every other
    # path helper in this file tests for the OS name "Darwin" (os.uname() contains both
    # on that machine), so use the same check here.
    if "Darwin" in os.uname():
        base = "/Users/jakob/Desktop/Master Project/git/CARMENES_for_Matrices/" + tp
    else:
        base = "/home/salomonsson/Master Project/git/CARMENES_for_Matrices/" + tp
    return(base + name + ".dat")
# Sanity-check the number of discovered files per channel.
print(len(carmenes_norm_paths_VIS))
print(len(carmenes_norm_paths_NIR))
# Make some local copies (deep copies so the processing below cannot mutate the originals)
carmenes_data_VIS = copy.deepcopy(all_loaded_data_VIS_average)
carmenes_data_NIR = copy.deepcopy(all_loaded_data_NIR_average)
def get_prepared_Carmenes_for_Matrix(carmenes_data, x, VIS_or_NIR):
    """
    Convolve the CARMENES spectrum stored at index `x` to resolution R and write it to disk.

    carmenes_data -- nested structure: carmenes_data[x][m] is a DataFrame with
                     "Angstrom" and "Flux" columns for spectral order m
    x             -- index (0-8) of the CARMENES file to process
    VIS_or_NIR    -- "NIR" or "VIS", selects the output directory

    For every order the flux is convolved with a wavelength-dependent Gaussian
    (FWHM set by R = 82000), normalised so the integrated flux is 1, and
    appended (with an "#order: N" header) to the file from create_CARMENES_path().
    """
    if VIS_or_NIR.upper() == "NIR":
        wave = "NIR"
    elif VIS_or_NIR.upper() == "VIS":
        wave = "VIS"
    else:
        print("Something went wrong, add VIS_or_NIR")
    # nv: half-width (in samples) of the convolution window; R: spectral resolution.
    nv, R, order = 50, 82000, 0
    name = str(carmenes_data[x][0].index.names)[-45:-10]  # NOTE(review): magic slice into the stored index name — confirm format
    for m in range(len(carmenes_data[x])):
        small_list = []
        for n in range(len(carmenes_data[x][m])):
            """Perform the necessary calculations"""
            # Get the Angstrom value at position "n" in the Carmenes data and store it
            use_this = float(copy.deepcopy(carmenes_data[x][m].iloc[n,0]))
            ar = carmenes_data[x][m].iloc[:,0]
            # Get the corresponding index
            itemindex = np.where(ar==use_this)[0]
            # Convert to integers and floats
            ind = int(itemindex[int(0)])
            use_this = float(use_this)
            # Convolution intervals, clipped to the order boundaries
            xsg1 = ind - nv
            xsg2 = ind + nv
            if xsg1 < 0:
                xsg1 = 0
            if xsg2 > len(carmenes_data[x][m]):
                xsg2 = len(carmenes_data[x][m])
            # Apply these intervals to the data
            xsgA = carmenes_data[x][m]["Angstrom"].iloc[xsg1:(xsg2)]
            xsgF = carmenes_data[x][m]["Flux"].iloc[(xsg1):xsg2]
            # Gaussian kernel centred on the current wavelength; sigma from FWHM = lambda/R
            xt = copy.deepcopy(use_this)
            sgm = float(use_this/(R*2*np.sqrt(2.*np.log(2.))))
            flt = np.exp(-(xsgA - xt)**2/(2*sgm**2))/(np.sqrt(2*np.pi)*sgm)
            # BUG FIX: xsgA2 was previously defined AFTER the two length checks below that
            # use it, raising a NameError on the very first iteration. Define it first.
            # (xsgA2 is the same slice as xsgA; kept as a separate name for fidelity.)
            xsgA2 = carmenes_data[x][m]["Angstrom"].iloc[(xsg1):(xsg2)]
            # Make the arrays compatible for multiplication
            if len(np.diff(xsgA2)) != len(flt):
                flt = flt[:-1]
            if len(np.diff(xsgA2)) != len(xsgF):
                xsgF = xsgF[:-1]
            # Calculate the sum of this impact, reverse xsgF before multiplying
            the_sum = np.sum(np.diff(xsgA2)*flt*xsgF[::-1])
            # Add the newly calculated flux to the Carmenes data on the appropriate position
            temp = copy.deepcopy(carmenes_data[x][m].iloc[n,:])
            temp["Flux"] = the_sum
            small_list.append(temp)
        # Add some space and an "#order" header before this order's data
        temp_list = []
        temp_list.append(' ')
        temp_list.append(' ')
        temp_list.append("#order: {0}".format(order))
        pd.DataFrame(temp_list).to_csv(create_CARMENES_path(name, wave),
                                       header=False, index=False, mode="a")
        order += 1
        # Create the dataframe
        df = pd.DataFrame(small_list)
        # Convert the df to numerical values
        df = df.apply(pd.to_numeric, errors='ignore')
        # Calculate the Rolling mean for the Flux and equal the "area below" to 1.
        temp_df = (df["Flux"].rolling(2).mean()[1:]*(np.diff(df["Angstrom"])))
        df["Flux"] = df["Flux"]/temp_df.sum()
        # Write to file
        df.to_csv(create_CARMENES_path(name, wave), header=False, index=False, mode="a")
# Create CARMENES VIS files
# BUG FIX: the `for` headers were commented out while their bodies were left active, so
# these lines referenced an undefined loop variable `i` (NameError). The expensive
# generation step stays fully disabled; uncomment the complete loops to regenerate.
#for i in range(len(carmenes_data_VIS)):
#    get_prepared_Carmenes_for_Matrix(carmenes_data_VIS, i, "VIS")
# Create CARMENES NIR files
#for i in range(len(carmenes_data_NIR)):
#    get_prepared_Carmenes_for_Matrix(carmenes_data_NIR, i, "NIR")
def get_CARMENES_READY_Info(NIR_or_VIS):
    """ Returns the file names and file paths for the pre-processed CARMENES data.
    'NIR_or_VIS' is a string, either 'NIR' or 'VIS' (case-insensitive).

    Raises ValueError for any other channel string (previously the else branch
    only printed a message and the function then crashed with a NameError
    because `tp` was never assigned).
    """
    if NIR_or_VIS.upper() == "NIR":
        tp = "NIR/"
    elif NIR_or_VIS.upper() == "VIS":
        tp = "VIS/"
    else:
        raise ValueError('The input is not correct. Enter "VIS" or "NIR" as strings. ')
    # Get the directory for each .dat file and store them in file_paths
    # Choose directory depending on the OS I'm running
    if "Darwin" in os.uname():
        directory = "/Users/jakob/Desktop/Master Project/git/CARMENES_for_Matrices/" + tp
    else:
        directory = "/home/salomonsson/Master Project/git/CARMENES_for_Matrices/" + tp
    carmenes_READY_paths = glob(directory + "*.dat")
    # Get the .dat file names (extension stripped)
    carmenes_READY_names = []
    for root, dirs, files in os.walk(directory):
        for filename in files:
            if filename.endswith(".dat"):
                carmenes_READY_names.append(filename[:-4])  # strip ".dat"
    return carmenes_READY_names, carmenes_READY_paths
# This function needed to be rewritten and changed even though it does the same thing on very similar input data as the
# "load_all_data" function used in the very beginning of this document. I havent found a reason for why finding all
# the "#" didnt worked as before.
def load_CARMENES_READY_data(name):
    """ Returns the btsettl dataframe with the name "name".

    Reads the matching file from the module-level carmenes_READY_paths_* lists,
    splits it on the "#order: N" marker rows and returns a list of per-order
    DataFrames with columns ["Angstrom", "Flux"].
    """
    # Pick the channel-specific path list (module-level globals).
    if "NIR" in name.upper():
        paths = carmenes_READY_paths_NIR
    elif "VIS" in name.upper():
        paths = carmenes_READY_paths_VIS
    else:
        print('Input is incorrect. Enter a file name.')
    for path in paths:
        if name in path:
            data = pd.read_table(path, sep=",", header=None, skiprows=3)
            data.index.names = [name] # Add the file name to the dataframe
            #limits = (data[data.values == '#']) # Find all the indeces with a "#" (Not working!)
            # Map each "#order" marker line to its row index; the first order starts at row 0.
            limits = {'# order: 0': 0}
            for i in range(len(data)):
                if "#" in data.iloc[i,0]:  # assumes column 0 holds strings on marker rows — TODO confirm for NaN rows
                    limits[data.iloc[i,0]] = data.index[i]
            keys = [key for key, value in limits.items()]  # NOTE(review): `keys` is never used below
            values = [int(value) for key, value in limits.items()]
            data_pieces, count = [], 0
            for i in range(len(limits)):
                start = values[count]
                if count == (len(limits)-1):
                    data_piece = data.iloc[start:,:] # The last data piece contains the last order in the df data
                else:
                    data_piece = data.iloc[start:values[count+1],:] # Select the correct data piece
                data_piece.index.names = [str(data_piece.index.names)[2:-2] + "_Order_" + str(count)] # add order to name
                data_piece.columns = ["Angstrom", "Flux"]
                data_piece = data_piece.dropna(inplace=False) # Remove NaN-values
                #data_piece.dropna(inplace=True) # Not like this to avoid a "SettingWithCopyWarning" error
                data_pieces.append(data_piece)
                count += 1
            return data_pieces
#test_data = load_CARMENES_READY_data(carmenes_READY_names_NIR[0])
# A minor change compared to the previous "get_POWER_matrix": the per-order
# matrices are resized to a common width so they can be concatenated.
def get_FULL_POWER_matrix_Carmenes(data):
    """Return the power matrix, scale indices and Fourier frequencies of `data`.

    All orders are included: each order's power matrix is resized to the width
    of the first order's matrix and the results are stacked vertically.

    Parameters
    ----------
    data : list of pandas.DataFrame
        Per-order dataframes with a "Flux" column
        (the output of load_CARMENES_READY_data).

    Returns
    -------
    tuple(np.ndarray, np.ndarray, np.ndarray)
        (stacked power matrix, concatenated scales, concatenated frequencies);
        all three are None if `data` is empty.
    """
    power_matrix, full_scales, full_freqs = None, None, None
    for i in range(len(data)):
        power, scales, freqs = get_POWER_matrix(data[i].loc[:, "Flux"])
        if power_matrix is None:
            # First order: start the accumulators.
            power_matrix = np.array(power)
            full_scales = np.array(scales)
            full_freqs = np.array(freqs)
        else:
            # Resize "power" to match power_matrix's width before stacking.
            # (The original detected "already initialised" with a fragile
            # `len(power_matrix) > 10` check against a size-1 sentinel array,
            # which breaks if the first power matrix has <= 10 rows.)
            power = skimage.transform.resize(power, (power.shape[0], power_matrix.shape[1]),
                                             mode='constant', preserve_range=True)
            power_matrix = np.append(power_matrix, power, axis=0)
            full_scales = np.append(full_scales, scales, axis=0)
            full_freqs = np.append(full_freqs, freqs, axis=0)
    return power_matrix, full_scales, full_freqs
# Verify that every NIR file and every VIS file contains the same number of
# spectral orders; "YES"/"NO" flags gate the conversion loops below.
NIR_OK = "YES" if check_orders(all_loaded_data_NIR) == True else "NO"
VIS_OK = "YES" if check_orders(all_loaded_data_VIS) == True else "NO"
### Each NIR file takes some 20 seconds to process. 335 min in total ###
carmenes_READY_names_NIR, carmenes_READY_paths_NIR = get_CARMENES_READY_Info("NIR")
# Choose the NIR power-matrix output directory depending on the OS.
if "Darwin" in os.uname():
    d = "/Users/jakob/Desktop/Master Project/POWER_matrices_CARMENES/NIR/"
else:
    d = "/home/salomonsson/Master Project/POWER_matrices_CARMENES/NIR/"
# Count the NIR files already present in directory d.
# (The original re-initialised nir_files inside the os.walk loop, so only the
# last visited directory was counted, and the name was undefined when the
# directory did not exist.)
nir_files = []
for dirpath, dirnames, files in os.walk(d):
    for f in files:
        if "NIR" in f.upper():
            nir_files.append(f)
if len(nir_files) < len(carmenes_READY_names_NIR):  # not every NIR file converted to a power matrix yet
    start1 = time.time()
    if NIR_OK == "YES":
        for i in range(len(carmenes_READY_names_NIR)):
            NIR_name = carmenes_READY_names_NIR[i]
            NIR_d = load_CARMENES_READY_data(NIR_name)
            NIR_m = get_FULL_POWER_matrix_Carmenes(NIR_d)[0]  # Get the Flux's power matrix
            NIR_path = create_CARMENES_MATRIX_path(NIR_name, 'NIR')
            # dtype='h' stores the matrix as 16-bit ("short") integers to save disk space.
            pd.DataFrame(NIR_m, dtype='h').to_csv(NIR_path, header=True, index=False, mode="w")
    # Check how long time it took.
    end1 = time.time()
    tt1 = end1 - start1
    print("Total run time: {0:0.0f} min and {1:3.0f} s ".format((tt1 - tt1 % 60) / 60, tt1 % 60))
else:
    print("All the power matrices for the CARMENES NIR files are already in the directory")
### Each VIS file takes some 30 seconds to process. ###
carmenes_READY_names_VIS, carmenes_READY_paths_VIS = get_CARMENES_READY_Info("VIS")
# Choose the VIS power-matrix output directory depending on the OS.
if "Darwin" in os.uname():
    d = "/Users/jakob/Desktop/Master Project/POWER_matrices_CARMENES/VIS/"
else:
    d = "/home/salomonsson/Master Project/POWER_matrices_CARMENES/VIS/"
# Count the VIS files already present in directory d.
# (The original re-initialised vis_files inside the os.walk loop, so only the
# last visited directory was counted, and the name was undefined when the
# directory did not exist.)
vis_files = []
for dirpath, dirnames, files in os.walk(d):
    for f in files:
        if "VIS" in f.upper():
            vis_files.append(f)
if len(vis_files) < len(carmenes_READY_names_VIS):  # not every VIS file converted to a power matrix yet
    start1 = time.time()
    if VIS_OK == "YES":
        for i in range(len(carmenes_READY_names_VIS)):
            VIS_name = carmenes_READY_names_VIS[i]
            VIS_d = load_CARMENES_READY_data(VIS_name)
            VIS_m = get_FULL_POWER_matrix_Carmenes(VIS_d)[0]  # Get the Flux's power matrix
            VIS_path = create_CARMENES_MATRIX_path(VIS_name, 'VIS')
            # dtype='h' stores the matrix as 16-bit ("short") integers to save disk space.
            pd.DataFrame(VIS_m, dtype='h').to_csv(VIS_path, header=True, index=False, mode="w")
    # Check how long time it took.
    end1 = time.time()
    tt1 = end1 - start1
    print("Total run time: {0:0.0f} min and {1:3.0f} s ".format((tt1 - tt1 % 60) / 60, tt1 % 60))
else:
    print("All the power matrices for the CARMENES VIS files are already in the directory")
def get_MATRIX_Carmenes_paths(NIR_or_VIS):
    """Return all the CARMENES power-matrix .csv paths for 'NIR' or 'VIS'.

    Parameters
    ----------
    NIR_or_VIS : str
        Either "NIR" or "VIS" (case-insensitive); selects the sub-directory.

    Raises
    ------
    ValueError
        If `NIR_or_VIS` is neither "NIR" nor "VIS".
    """
    channel = NIR_or_VIS.upper()
    if channel not in ("NIR", "VIS"):
        # The original printed 'Please enter "NIS" or "VIR".' (letters swapped)
        # and then crashed on the undefined variable `tp`; raise instead.
        raise ValueError('Please enter "NIR" or "VIS".')
    tp = channel + "/"
    # Choose the base directory depending on the OS.
    if "Darwin" in os.uname():
        directory = "/Users/jakob/Desktop/Master Project/POWER_matrices_CARMENES/" + tp
    else:
        directory = "/home/salomonsson/Master Project/POWER_matrices_CARMENES/" + tp
    return glob(directory + "*.csv")
def get_Carmenes_Matrix(path):
    """Return the power matrix stored at `path` and its file name.

    The first row of the .csv is treated as the header; the returned matrix is
    the underlying numpy array, and the name is the basename without ".csv".
    """
    frame = pd.read_csv(path)  # load the power matrix
    stem = os.path.basename(path)[:-4]  # drop the ".csv" extension
    return frame.values, stem
# Create a MinMax scaler used later to unscale the model's predictions back to
# the physical parameter range.
param_scaler = MinMaxScaler(feature_range=(0,1), copy=True)
# Fit the scaler on the true parameters of one training matrix.
# NOTE(review): this assumes the first NIR training file spans the same
# parameter range as the rest of the data set — confirm before relying on the
# unscaled predictions.
mat, param, name = get_Matrix_and_Parameters(get_MATRIX_paths("NIR")[0])
print(param)
scaled_param = param_scaler.fit_transform(param.reshape(-1,1))
print(scaled_param)
# Load the best performing NIR model's weights and set the prediction output
# path. NOTE(review): NIR_model is assumed to be built earlier in this file
# with an architecture matching these weights — confirm before running.
if "Darwin" in os.uname():
    model_path = "/Users/jakob/Desktop/Master Project/Models/NIR_100epochs_Matrix_DivOn2_Sequence.hdf5"
    pred_path = "/Users/jakob/Desktop/Master Project/Predictions/CARMENES/NIR_predictions_CARMENES.xlsx"
else:
    start = "/home/salomonsson/Master Project/Best_models/NIR/"
    model_path = start+"NIR_100epochs_Matrix_DivOn2_Sequence_MAIN_FINAL_MODEL/NIR_100epochs_Matrix_DivOn2_Sequence.hdf5"
    pred_path = "/home/salomonsson/Master Project/Predictions/CARMENES/NIR_predictions_CARMENES.xlsx"
NIR_model.load_weights(model_path)
# Predict stellar parameters for every CARMENES NIR matrix with the NIR model
# and save the results to file.
np.set_printoptions(precision=3)
list_for_saving = []
down_size = 2  # matrices are shrunk by this factor to fit the trained NIR model
carMat_paths = get_MATRIX_Carmenes_paths("NIR")
print("Parameters are printed as [Temperature, Log(G), Metallicity]")
print("---------------------------------------------------------------------------------------------")
print(" ")
# Iterate over all available matrices. (The original hard-coded `range(9)`,
# which raised an IndexError with fewer files and skipped any extra ones.)
for mat_path in carMat_paths:
    # Load the matrix and its name, then resize the matrix.
    mat, name = get_Carmenes_Matrix(mat_path)
    mat = skimage.transform.resize(mat, (mat.shape[0]//down_size, mat.shape[1]//down_size, 3),
                                   mode='constant', preserve_range=True)
    # Reshape the matrix to "[batch, 3D RGB image]".
    mat = np.reshape(mat, (1, mat.shape[0], mat.shape[1], mat.shape[2]))
    # Rescale the matrix to [0,1]
    mat = normalise(mat)
    # Use the model to predict the (scaled) parameters.
    predicted_parameters = NIR_model.predict(mat)
    # Use the fitted scaler to unscale the parameters to their original interval.
    unscaled_pred_param = param_scaler.inverse_transform(predicted_parameters)
    # Collect one row per matrix: [name, T, log(g), metallicity].
    d = pd.DataFrame(unscaled_pred_param.reshape(1, -1))
    dfName = pd.DataFrame([name])
    d1 = pd.concat([dfName, d], axis=1)
    list_for_saving.append(d1)
    # Print the results.
    print("Matrix: {0}".format(name))
    print("Predicted Parameters: {0}".format(unscaled_pred_param[0]))
    print()
# Save to file.
# NOTE(review): pred_path ends in ".xlsx" but to_csv writes plain CSV, so the
# file content is CSV despite the extension — confirm downstream readers.
d2 = pd.concat(list_for_saving, axis=0)
columns = ["Matrix name", "Predicted Temperature", "Predicted Log(g)", "Predicted Metallicity"]
d2.columns = columns
d2.to_csv(pred_path, header=True, index=False, mode="w")
print()
print("End")
# Load the best performing VIS model's weights and set the prediction output
# path (the original comment incorrectly said "NIR model").
# NOTE(review): VIS_model is assumed to be built earlier in this file with an
# architecture matching these weights — confirm before running.
if "Darwin" in os.uname():
    model_path = "/Users/jakob/Desktop/Master Project/Models/VIS_150epochs_Matrix_DivOn3_batch2_Sequence.hdf5"
    pred_path = "/Users/jakob/Desktop/Master Project/Predictions/CARMENES/VIS_predictions_CARMENES.xlsx"
else:
    start = "/home/salomonsson/Master Project/Best_models/VIS/"
    model_path = start+"VIS_150epochs_Matrix_DivOn3_batch2_Sequence/VIS_150epochs_Matrix_DivOn3_batch2_Sequence.hdf5"
    pred_path = "/home/salomonsson/Master Project/Predictions/CARMENES/VIS_predictions_CARMENES.xlsx"
VIS_model.load_weights(model_path)
# Notebook-cell leftovers: inspect one matrix's shape and the resized height.
# The two bare expressions below display values in a notebook but have no
# effect when run as a script.
mat, name = get_Carmenes_Matrix(carMat_paths[0])
mat.shape
7747//down_size
# Predict stellar parameters for every CARMENES VIS matrix with the VIS model
# and save the results to file.
np.set_printoptions(precision=3)
list_for_saving = []
down_size = 3  # VIS matrices are shrunk by this factor to fit the trained VIS model
carMat_paths = get_MATRIX_Carmenes_paths("VIS")
print("Parameters are printed as [Temperature, Log(G), Metallicity]")
print("---------------------------------------------------------------------------------------------")
print(" ")
# Iterate over all available matrices. (The original hard-coded `range(9)`,
# which raised an IndexError with fewer files and skipped any extra ones.)
for mat_path in carMat_paths:
    # Load the matrix and its name, then resize the matrix to fit the model.
    mat, name = get_Carmenes_Matrix(mat_path)
    # 7747 is the fixed height the VIS model was trained on — presumably the
    # row count of the training matrices; TODO confirm.
    mat = skimage.transform.resize(mat, (7747//down_size, mat.shape[1]//down_size, 3),
                                   mode='constant', preserve_range=True)
    # Reshape the matrix to "[batch, 3D RGB image]".
    mat = np.reshape(mat, (1, mat.shape[0], mat.shape[1], mat.shape[2]))
    # Rescale the matrix to [0,1]
    mat = normalise(mat)
    # Use the model to predict the (scaled) parameters.
    predicted_parameters = VIS_model.predict(mat)
    # Use the fitted scaler to unscale the parameters to their original interval.
    unscaled_pred_param = param_scaler.inverse_transform(predicted_parameters)
    # Collect one row per matrix: [name, T, log(g), metallicity].
    d = pd.DataFrame(unscaled_pred_param.reshape(1, -1))
    dfName = pd.DataFrame([name])
    d1 = pd.concat([dfName, d], axis=1)
    list_for_saving.append(d1)
    # Print the results.
    print("Matrix: {0}".format(name))
    print("Predicted Parameters: {0}".format(unscaled_pred_param[0]))
    print()
# Save to file.
# NOTE(review): pred_path ends in ".xlsx" but to_csv writes plain CSV, so the
# file content is CSV despite the extension — confirm downstream readers.
d2 = pd.concat(list_for_saving, axis=0)
columns = ["Matrix name", "Predicted Temperature", "Predicted Log(g)", "Predicted Metallicity"]
d2.columns = columns
d2.to_csv(pred_path, header=True, index=False, mode="w")
print()
print("End")